The lists are published here: http://rcaj.nanjing.gov.cn/zhzbweb/m.html
It seems a new batch only comes out once a quarter, though, so it's easy enough to write a bit of code to pull everything down.
import requests

columns = ['corpName', 'label', 'publicityBatch', 'publicityYear', 'subsideWay', 'talentLevel', 'userName']

def get_announcement_list():
    # One POST is enough to enumerate all quarterly announcements
    url = 'http://rcaj.nanjing.gov.cn/zhzbweb/rcaj/tRcajRygs'
    data = {
        'name': '人才安居名单公示',
        'pageNo': 1,
        'pageSize': 500
    }
    r = requests.post(url, data=data)
    return r.json()['list']

def get_talent_list(announcement_id, page_no):
    # Fetch one page of names for a given announcement
    qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
    qdata = {
        'rygsId': announcement_id,
        'pageNo': page_no,
        'userName': ''
    }
    r = requests.post(qurl, data=qdata)
    return r.json()

announcement_list = get_announcement_list()
for announcement in announcement_list:
    announcement_id = announcement['id']
    total_page = get_talent_list(announcement_id, 0)['totalPage']
    # pageNo appears to be 0-indexed (the probe above uses 0), so walk
    # pages 0 .. totalPage-1; range(1, total_page) would silently skip one page
    for page in range(total_page):
        talent_list = get_talent_list(announcement_id, page)['list']
        for talent in talent_list:
            item = {c: talent[c] for c in columns}
            print(item)
Banged out in ten-odd minutes, so it's not pretty and not efficient either; the Scrapy framework would be much faster.
Too lazy to rewrite it for now.
The storage, though, obviously can't just be print output; I set up a database to hold the results.
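The table definition itself isn't shown above; here is a minimal sketch of a schema that would work. The column names come from the columns list in the script, but the varchar types and lengths are my own guesses, not the actual definition:

import pymysql

# Hypothetical schema: column names match the scraped fields, but the
# varchar types/lengths are assumptions, not from the real table.
connection = pymysql.connect(host='localhost', port=3306, user='root',
                             password='password', db='mynj', charset='utf8mb4')
with connection.cursor() as cursor:
    cursor.execute('''
        create table if not exists talent (
            corpName       varchar(255),
            label          varchar(255),
            publicityBatch varchar(64),
            publicityYear  varchar(16),
            subsideWay     varchar(255),
            talentLevel    varchar(64),
            userName       varchar(64)
        ) default charset = utf8mb4
    ''')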
import pymysql

connection = pymysql.connect(host='localhost', port=3306, user='root',
                             password='password', db='mynj', charset='utf8mb4')
cursor = connection.cursor()

def insert_item(item):
    # Use a parameterized query so the driver escapes values itself,
    # instead of pasting quote-mangled strings straight into the SQL
    columns = ','.join(item.keys())
    placeholders = ','.join(['%s'] * len(item))
    sql = f'insert into talent({columns}) values({placeholders})'
    cursor.execute(sql, list(item.values()))
    connection.commit()
Then it's just a matter of swapping the print(item) at the end for insert_item(item).
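Concretely, the inner loop of the first script ends up as:

for talent in talent_list:
    item = {c: talent[c] for c in columns}
    insert_item(item)  # was print(item)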
It turned out to still be too slow without Scrapy, though…
import scrapy
import requests

class TalentSpider(scrapy.Spider):
    name = 'talent'
    allowed_domains = ['mynj.cn', 'nanjing.gov.cn']
    columns = ['corpName', 'label', 'publicityBatch', 'publicityYear', 'subsideWay', 'talentLevel', 'userName']

    def get_announcement_list(self):
        # Synchronous probe: one request enumerates all announcements
        url = 'http://rcaj.nanjing.gov.cn/zhzbweb/rcaj/tRcajRygs'
        data = {
            'name': '人才安居名单公示',
            'pageNo': 1,
            'pageSize': 500
        }
        r = requests.post(url, data=data)
        return r.json()['list']

    def get_talent_list(self, announcement_id, page_no):
        # Another synchronous probe, just to read totalPage for an announcement
        qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
        qdata = {
            'rygsId': announcement_id,
            'pageNo': page_no,
            'userName': ''
        }
        r = requests.post(qurl, data=qdata)
        return r.json()

    def start_requests(self):
        announcement_list = self.get_announcement_list()
        for announcement in announcement_list:
            announcement_id = announcement['id']
            total_page = self.get_talent_list(announcement_id, 0)['totalPage']
            # pageNo looks 0-indexed, so cover pages 0 .. totalPage-1
            for page in range(total_page):
                qurl = 'https://m.mynj.cn:11109/zhzbapp/a/mobile/rygs/list'
                qdata = {
                    'rygsId': str(announcement_id),
                    'pageNo': str(page),
                    'userName': ''
                }
                yield scrapy.FormRequest(qurl, formdata=qdata, callback=self.parse)

    def parse(self, response):
        # response.json() needs Scrapy >= 2.2
        talent_list = response.json()['list']
        for talent in talent_list:
            item = {c: talent[c] for c in self.columns}
            yield item
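To persist what the spider yields, Scrapy wants an item pipeline rather than a bare cursor. A minimal sketch that reuses the parameterized insert from above; the class name and connection parameters are placeholders, not from the original setup:

import pymysql

class MysqlPipeline:
    # Hypothetical pipeline: opens one connection per crawl and inserts
    # each yielded dict into the same talent table as before
    def open_spider(self, spider):
        self.connection = pymysql.connect(host='localhost', port=3306, user='root',
                                          password='password', db='mynj', charset='utf8mb4')
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        columns = ','.join(item.keys())
        placeholders = ','.join(['%s'] * len(item))
        sql = f'insert into talent({columns}) values({placeholders})'
        self.cursor.execute(sql, list(item.values()))
        self.connection.commit()
        return item

    def close_spider(self, spider):
        self.connection.close()

Enable it in settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.MysqlPipeline': 300} (adjust the module path to your project), then run the spider with scrapy crawl talent. The speedup over the plain-requests version comes from Scrapy issuing requests concurrently (CONCURRENT_REQUESTS, 16 by default).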
That should do it.


