很奇妙，竟然又能用到我许久没用过的爬虫。不过是个很小的工程，就不用框架，也不费神反爬了，几行代码已经可以满足我的需求。
网址:http://www.landnj.cn/LandBargainInfo.aspx
成交信息列表

Yaodo·2025-04-18·94 次阅读
很奇妙，竟然又能用到我许久没用过的爬虫。不过是个很小的工程，就不用框架，也不费神反爬了，几行代码已经可以满足我的需求。
网址:http://www.landnj.cn/LandBargainInfo.aspx
成交信息列表

成交信息

效果

代码
import requests
import re
from lxml import etree
def format_text(element):
    """Concatenate text fragments and remove every run of whitespace.

    Args:
        element: An iterable of strings, e.g. the list returned by an
            XPath ``//text()`` query.

    Returns:
        The fragments joined together with all whitespace (spaces, tabs,
        newlines, ...) stripped out.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.sub(r'\s+', '', ''.join(element))
def update_li(tree, li):
    """Fetch every detail page linked from a list page and append its fields.

    For each anchor in ``tree`` whose ``href`` contains ``LandInfo.aspx?Id=``
    the detail page is downloaded, its key/value table is read, and one dict
    per detail page is appended to ``li``.

    Args:
        tree: Parsed list page (an lxml element tree).
        li: List that collected items are appended to (mutated in place).

    Returns:
        The same ``li`` object, for call-site convenience.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            response for a detail page.
        IndexError: If a detail page does not contain the nested tables this
            function indexes into.
    """
    from urllib.parse import urljoin

    base_url = 'http://www.landnj.cn/'
    for anchor in tree.xpath('//a[contains(@href, "LandInfo.aspx?Id=")]'):
        # urljoin also copes with root-relative or absolute hrefs.
        detail_url = urljoin(base_url, anchor.xpath('./@href')[0])
        # A timeout keeps one stalled detail page from hanging the whole crawl.
        response = requests.get(detail_url, timeout=30)
        response.raise_for_status()
        detail_tree = etree.HTML(response.text)
        # The data table is addressed purely by position: third table under
        # div.page_main, then the second table nested inside it.
        table = detail_tree.xpath('//div[@class="page_main"]//table')[2].xpath('.//table')[1]
        item = {
            "url": detail_url,
            "title": format_text(detail_tree.xpath(
                '//*[@id="ctl00_ContentPlaceHolder1_lblLandCode2"]//text()')),
            # Text of the first <td> following the anchor's grandparent
            # element on the list page.
            "地区": format_text(anchor.xpath('../../following-sibling::td[1]//text()')),
        }
        for row in table.xpath('.//tr'):
            cells = row.xpath('.//td')
            # Cells alternate label, value. zip() drops a trailing unpaired
            # cell instead of raising IndexError on rows with an odd count.
            for key_cell, value_cell in zip(cells[::2], cells[1::2]):
                key = format_text(key_cell.xpath('.//text()'))
                item[key] = format_text(value_cell.xpath('.//text()'))
        li.append(item)
    return li
def get_page_data(tree, page):
    """Build the form payload that asks the pager for page ``page``.

    The payload consists of the pager postback fields plus every hidden
    ``<input>`` in ``tree`` whose ``name`` contains ``"__"``.

    Args:
        tree: Parsed page (an lxml element tree) from the most recent
            response; its hidden inputs are copied into the payload.
        page: Page number to request, sent as ``__EVENTARGUMENT``.

    Returns:
        A dict suitable for ``requests.post(..., data=...)``.
    """
    data = {"__EVENTTARGET": "ctl00$ContentPlaceHolder1$AspNetPager1",
            "__EVENTARGUMENT": page}
    postback_keys = set(data)
    for field in tree.xpath('.//input[@type="hidden"][contains(@name,"__")]'):
        # Forms are submitted by field name, so key on @name rather than @id.
        name = field.get("name")
        # If the page itself renders hidden __EVENTTARGET / __EVENTARGUMENT
        # inputs, they must not overwrite the pager postback values.
        if name in postback_keys:
            continue
        # A hidden input without a value attribute submits an empty string.
        data[name] = field.get("value", "")
    return data
# Crawl list pages 1-4 and collect one dict per linked detail page.
li = []
url = 'http://www.landnj.cn/LandBargainInfo.aspx?type=0&area=&key=&sdate=&edate='
# Page 1 is a plain GET; a timeout and status check keep a dead or failing
# server from hanging the script or being parsed as if it were data.
r = requests.get(url, timeout=30)
r.raise_for_status()
tree = etree.HTML(r.text)
li = update_li(tree, li)
for page in range(2, 5):
    # Later pages are requested by POSTing the pager postback together with
    # the hidden fields of the most recent response, so `tree` is refreshed
    # on every iteration.
    r = requests.post(url, data=get_page_data(tree, page), timeout=30)
    r.raise_for_status()
    tree = etree.HTML(r.text)
    li = update_li(tree, li)
print(len(li))
# Guard the sample print: an empty crawl would otherwise raise IndexError.
if li:
    print(li[0])
重点
Comments | NOTHING