很奇妙，竟然又能用到我许久没用过的爬虫。不过是个很小的工程，就不用框架，也不费神反爬了，几行代码已经可以满足我的需求。

网址:http://www.landnj.cn/LandBargainInfo.aspx

成交信息列表

成交信息

效果

代码

import requests
import re
from lxml import etree

def format_text(element):
    """Join a list of xpath text nodes and strip ALL whitespace.

    *element* is typically the list returned by an ``.xpath('...//text()')``
    call; the pieces are concatenated and every whitespace run (spaces,
    tabs, newlines) is removed entirely.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    # and raises a DeprecationWarning on modern Python.
    return re.sub(r'\s+', '', ''.join(element))

def update_li(tree, li):
    """Scrape every deal-detail page linked from the listing page *tree*.

    For each ``<a href="LandInfo.aspx?Id=...">`` link found in *tree*, fetch
    the detail page, extract the title, district and the key/value attribute
    table, and append one dict per deal to *li*.

    Parameters:
        tree: lxml HTML tree of a listing page.
        li: list of result dicts; mutated in place.

    Returns:
        The same list *li*, for convenient chaining.
    """
    for a in tree.xpath('//a[contains(@href, "LandInfo.aspx?Id=")]'):
        detail_url = 'http://www.landnj.cn/{}'.format(a.xpath('./@href')[0])
        # timeout so a dead server cannot hang the crawl forever
        resp = requests.get(detail_url, timeout=30)
        detail_tree = etree.HTML(resp.text)
        # NOTE(review): table position is hard-coded to the site's current
        # layout (3rd table under page_main, 2nd nested table) — confirm
        # if the page structure changes.
        table = detail_tree.xpath('//div[@class="page_main"]//table')[2].xpath('.//table')[1]
        item = {
            "url": detail_url,
            "title": format_text(detail_tree.xpath('//*[@id="ctl00_ContentPlaceHolder1_lblLandCode2"]//text()')),
            "地区": format_text(a.xpath('../../following-sibling::td[1]//text()')),
        }
        for tr in table.xpath('.//tr'):
            tds = tr.xpath('.//td')
            # Cells alternate key/value; stop at len - 1 so a row with an
            # odd number of cells cannot raise IndexError on tds[i + 1].
            for i in range(0, len(tds) - 1, 2):
                key = format_text(tds[i].xpath('.//text()'))
                value = format_text(tds[i + 1].xpath('.//text()'))
                item[key] = value
        li.append(item)
    return li

def get_page_data(tree, page):
    """Build the form payload for an ASP.NET pager postback to *page*.

    The pager target/argument fields request the given page number, and
    every hidden ``__*`` field (``__VIEWSTATE``, ``__EVENTVALIDATION``, ...)
    is copied from the current page *tree* so the server accepts the
    postback.

    Parameters:
        tree: lxml HTML tree of the currently loaded listing page.
        page: page number to request.

    Returns:
        dict suitable for ``requests.post(url, data=...)``.
    """
    data = {
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$AspNetPager1",
        "__EVENTARGUMENT": page,
    }
    for hidden in tree.xpath('.//input[@type="hidden"][contains(@name,"__")]'):
        # NOTE(review): keys are taken from @id, which matches @name for
        # ASP.NET's auto-generated hidden fields — confirm should they differ.
        data[hidden.xpath('.//@id')[0]] = hidden.xpath('.//@value')[0]
    return data

# Crawl the first four listing pages: a plain GET fetches page 1, then
# each later page is requested via an ASP.NET pager postback that echoes
# the hidden form state from the page currently loaded.
records = []
list_url = 'http://www.landnj.cn/LandBargainInfo.aspx?type=0&area=&key=&sdate=&edate='
response = requests.get(list_url)
page_tree = etree.HTML(response.text)
records = update_li(page_tree, records)

for page_no in range(2, 5):
    response = requests.post(list_url, data=get_page_data(page_tree, page_no))
    page_tree = etree.HTML(response.text)
    records = update_li(page_tree, records)

print(len(records))
print(records[0])

重点

  1. 使用etree而不是BeautifulSoup，配合Chrome开发工具的xpath更符合我习惯；
  2. 使用python模拟asp.net的翻页请求，要不然总是在第一页；
  3. 遇到不会的，问ChatGPT。