# Python爬取拉勾网招聘信息实验 (lab exercise: scraping Lagou job postings with Python)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import json
import time
import requests
import csv


# Output file for the scraped postings. It is opened once at module level and
# shared by the rest of the script; newline='' leaves line-ending handling to
# the csv module.
f = open('lgposition.csv', mode='w', newline='', encoding='utf-8')
# CSV writer bound to that file object.
csv_writer = csv.writer(f)
# Header row: one title per output column.
csv_writer.writerow([
    "公司",
    "职位名称",
    "公司简称",
    "公司规模",
    "公司行业",
    "融资",
    "福利",
    "职位类型",
    "第二职位",
    "第三职位",
    "技能",
    "职位发布时间",
    "城市",
    "区域",
    "薪水",
    "工作年限",
    "学历",
    "职位优势",
])


def main(pages, position):
    """Crawl Lagou search results for ``position`` and store them as CSV rows.

    Result pages ``1`` through ``pages - 1`` are requested one after another
    (``pages`` is an exclusive upper bound). The postings of each page are
    handed to ``extractpositiondata``, which writes them to the CSV file.

    Args:
        pages: Exclusive upper bound of the page numbers to fetch (int).
        position: Search keyword sent as the ``kd`` form field.

    Raises:
        requests.RequestException: On network errors or timeouts.
        KeyError: If the JSON answer lacks ``content.positionResult.result``.
        ValueError: If the response body is not valid JSON.
    """
    # Search landing page. It is fetched before every AJAX call, presumably so
    # the session obtains the cookies the AJAX endpoint expects.
    # NOTE(review): this URL and the Referer below hard-code "list_python"
    # regardless of ``position`` -- confirm that is intended.
    url1 = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWo' \
           'rds=&suginput='
    # AJAX endpoint that returns the postings as JSON.
    url = "https://www.lagou.com/jobs/positionAjax.json?gj=%E5%9C%A8%E6%A0%A1%2F%E5%BA%94%E5%B1%8A&px=default" \
          "&needAddtionalResult=false"
    # Request headers sent with both requests.
    headers = {
        'Host': 'www.lagou.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?px=default&gj=%E5%9C%A8%E6%A0%A1/%E5%'
                   'BA%94%E5%B1%8A',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.418'
                      '3.121 Safari/537.36'
    }
    last_page = pages - 1
    # Paging is controlled through the ``pn`` field of the POST form data.
    for page in range(1, pages):
        print("正在爬取第" + str(page) + "页,共" + str(last_page) + "页...")
        data = {
            'first': 'false',
            'pn': page,
            'kd': position
        }
        # A fresh session per page, as before, but now closed deterministically
        # so its connections are not leaked.
        with requests.Session() as s:
            s.get(url=url1, headers=headers, timeout=3)
            respon = s.post(url=url, headers=headers, data=data, timeout=3)
            payload = respon.text
        # Pause between pages to throttle the crawl.
        time.sleep(3)
        results = json.loads(payload)['content']['positionResult']['result']
        extractpositiondata(results)
        if page == last_page:
            print("爬取完毕!")


def extractpositiondata(results, writer=None):
    """Write one CSV row per job posting in ``results``.

    Args:
        results: Sequence of posting dicts taken from the AJAX response.
            ``None`` or an empty sequence writes nothing.
        writer: Object with a ``writerow`` method that receives each row.
            Defaults to the module-level ``csv_writer``.

    Raises:
        KeyError: If a posting lacks one of the keys read in
            ``_position_row``.
    """
    if not results:
        return
    if writer is None:
        # Fall back to the writer created at module level.
        writer = csv_writer
    for result in results:
        # 4. Write the CSV content for this posting.
        writer.writerow(_position_row(result))


def _join_labels(labels):
    """Concatenate labels, each followed by '、'; ``None``/empty gives ''."""
    # The trailing separator is kept on purpose so the CSV output is
    # unchanged for existing consumers.
    return ''.join(label + '、' for label in labels or ())


def _position_row(result):
    """Build the row for one posting, in the same order as the CSV header."""
    industry = result['industryField']
    if isinstance(industry, str):
        industry = industry.replace(',', '|')

    advantage = result['positionAdvantage']
    if isinstance(advantage, str):
        # The earlier check `"," or ',' in ...` was always true and the same
        # ASCII-comma replacement ran twice. Presumably one pass was meant for
        # the full-width comma, so both comma forms are normalised here.
        advantage = advantage.replace(',', '、').replace(',', '、')

    return [
        result['companyFullName'],
        result['positionName'],
        result['companyShortName'],
        result['companySize'],
        industry,
        result['financeStage'],
        _join_labels(result['companyLabelList']),
        result['firstType'],
        result['secondType'],
        result['thirdType'],
        _join_labels(result['skillLables']),
        result['createTime'],
        result['city'],
        result['district'],
        result['salary'],
        result['workYear'],
        result['education'],
        advantage,
    ]


if __name__ == '__main__':
    try:
        main(10, 'python')
    finally:
        # 5. Close the file even when the crawl aborts with an exception, so
        # rows already written are flushed to disk.
        f.close()