Preface

Bilibili is a great place to be: you can binge plenty of tutorials there for free.

I recently spent about five days grinding through a Scrapy tutorial there (apparently titled "Python爬虫从入门到精通-高级篇"). The instructor teaches with real energy and almost never pads things out. Finishing it certainly didn't make me the "master" the title promises, but I definitely came away with a decent grasp of the Scrapy framework, and the official docs are much quicker reading for me now.

The videos were uploaded in mid-2019 but seem to have been recorded in late 2017, and some of the hands-on projects have since changed quite a bit. Fortunately the final capstone case is largely intact: a crawler for nationwide new-home and second-hand-home listings on Fang.com (房天下).

In practice, though, I found that after crawling a single page of second-hand listings I would invariably get redirected to a captcha page, like this:

Redirecting (302) to <GET http://search.fang.com/captcha-bc138d122b3e1ec151/?t=1583752127.773&h=aHR0cHM6Ly9lc2YuZmFuZy5jb20vaG91c2UvaTMyLw%3D%3D&c=cmE6MTEyLjIuMTk4LjE5Nzt4cmk6O3hmZjo%3D> from <GET https://esf.fang.com/house/i32/>

Even setting the download delay to 10 doesn't fix it, and switching to a fresh proxy IP for every request isn't realistic either; besides, randomly drawing the same IP again just gets redirected all the same.
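
For reference, the throttling attempt amounted to nothing more than this in settings.py:

DOWNLOAD_DELAY = 10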

This had me stumped for nearly a whole day. In the morning I worked out how to get past it with the requests library; in the afternoon I kept failing to reproduce that inside Scrapy, so in the end I simply imported requests into the Scrapy project and let it fetch the cookies.
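
For the record, that morning's requests-only experiment boiled down to roughly the sketch below. It is reconstructed from the middleware further down; the captcha URL comes from the redirect log above, and whether a made-up token value actually passes the check is my assumption:

import requests

headers = {'user-agent': 'Mozilla/5.0 ...'}  # same UA string as in the middleware below
session = requests.Session()

# The page the 302 keeps redirecting to, copied from the log above
captcha_url = 'http://search.fang.com/captcha-bc138d122b3e1ec151/'

# Visiting the captcha page once drops the cookies the site expects into the session
session.get(captcha_url, headers=headers)

# Save the captcha image, open it, and type the code in by hand
img = session.get(captcha_url + 'captcha-image', headers=headers)
with open('captcha.png', 'wb') as f:
    f.write(img.content)
code = input('Enter the captcha code: ')

# Submit the answer; with the session cookies set, listing pages load normally again
session.post(captcha_url, headers=headers,
             data={'code': code, 'submit': '提交', 'token': '42'})
page = session.get('https://esf.fang.com/house/i32/', headers=headers)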

Solution

Enable the downloader middleware in settings.py:

DOWNLOADER_MIDDLEWARES = {
   'fang.middlewares.FangDownloaderMiddleware': 543,
}
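
The 543 is just the middleware's order value: Scrapy's own downloader middlewares use numbers between 100 and 900, lower meaning closer to the engine, and the template default of 543 slots this one in comfortably among them.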

Build the downloader middleware in middlewares.py:

from scrapy import FormRequest
import random
import requests


class FangDownloaderMiddleware(object):
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15",
    }

    def process_response(self, request, response, spider):
        try:
            # Only step in when we have been redirected to the captcha page
            if 'captcha' in response.url:
                # Remember the original callback so the re-issued request
                # still lands in the right parse method
                callback = request.callback
                # Carry over the info stashed in the request's meta
                province, city = request.meta.get('info')
                # Hit the captcha page once with requests to obtain cookies
                r = requests.get(response.url, headers=self.headers)
                self.cookies = requests.utils.dict_from_cookiejar(r.cookies)
                # Download the captcha image and collect the answer
                captcha_img_url = response.url.split('?t')[0] + 'captcha-image'
                captcha_data = self.get_captcha_data(captcha_img_url)
                # Re-issue the request as a form submission of the captcha answer
                request = FormRequest(response.url, formdata=captcha_data,
                                      meta={'info': (province, city)}, callback=callback)
                # Attach the cookies fetched above so the site recognizes us
                request.cookies = self.cookies
                return request
            # Anything that is not a captcha page passes straight through
            else:
                return response
        except Exception:
            # If the captcha handling fails for any reason, fall back to
            # handing the original response to the spider
            return response

    # Download the captcha image with requests and build the form data
    def get_captcha_data(self, url):
        r = requests.get(url, headers=self.headers, cookies=self.cookies)
        # Save the captcha image so it can be opened and read
        with open('captcha.png', 'wb') as file:
            file.write(r.content)
        # Type the code in by hand
        captcha_code = input('Enter the captcha code: ')
        # Build the form data; I have no idea what 'token' is supposed to be,
        # so a random number stands in for it
        captcha_data = {
            'code': captcha_code,
            'submit': '提交',  # the site expects this literal value ('submit')
            'token': str(random.randint(30, 80)),
        }
        return captcha_data
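
Two details are easy to miss here. Assigning request.cookies after construction works because Scrapy requests keep cookies as a plain attribute that the built-in CookiesMiddleware reads, so COOKIES_ENABLED has to stay at its default of True. And returning the untouched response from the except branch is a deliberately blunt safety net: if anything in the captcha handling blows up, the spider at least still gets the original response.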

Run the spider again and you can open the saved captcha image, type the code in by hand, and carry on crawling. Manual entry is admittedly pretty low-tech: if you're well versed in image recognition you can design your own algorithm to crack the captcha, and if not, you can grab any cheap recognition API off the Aliyun marketplace (swapping one in only touches get_captcha_data, as sketched below), for example this one:
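
Whichever service you pick, the swap stays confined to get_captcha_data. A minimal sketch, assuming a hypothetical solve_captcha(image_bytes) helper that wraps the recognition API (the name and signature are mine, not any real SDK's):

    def get_captcha_data(self, url):
        r = requests.get(url, headers=self.headers, cookies=self.cookies)
        # Hand the raw image bytes to the recognizer instead of a human;
        # solve_captcha() is a hypothetical wrapper around whatever API you bought
        captcha_code = solve_captcha(r.content)
        return {
            'code': captcha_code,
            'submit': '提交',
            'token': str(random.randint(30, 80)),
        }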