Python爬取拉勾网招聘信息实验

Python爬取拉勾网招聘信息实验
import json
import time
import requests
import csv


# 1. Open the output file; newline='' leaves line-ending handling to the csv module.
f = open('lgposition.csv', mode='w', encoding='utf-8', newline='')
# 2. Build the CSV writer shared by the rest of the script on top of that file.
csv_writer = csv.writer(f)
# 3. Emit the header row (one label per column written by extractpositiondata).
csv_writer.writerow([
    "公司",
    "职位名称",
    "公司简称",
    "公司规模",
    "公司行业",
    "融资",
    "福利",
    "职位类型",
    "第二职位",
    "第三职位",
    "技能",
    "职位发布时间",
    "城市",
    "区域",
    "薪水",
    "工作年限",
    "学历",
    "职位优势",
])


def main(pages: int, position: str) -> None:
    """Fetch Lagou job-listing pages and hand each page's results to the CSV writer.

    Pages ``1`` through ``pages - 1`` are requested (``pages`` is an exclusive
    upper bound). For each page the parsed ``content.positionResult.result``
    list is passed to ``extractpositiondata``.

    Args:
        pages: Exclusive upper bound of the page numbers to fetch.
        position: Search keyword sent as the ``kd`` form field.

    Raises:
        KeyError: If the JSON response lacks the expected
            ``content``/``positionResult``/``result`` keys.
    """
    # Landing page, requested first on every session.
    # NOTE(review): this URL (and the Referer below) is hard-coded to the
    # "python" listing regardless of ``position`` — confirm that is intended.
    url1 = 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWo' \
           'rds=&suginput='
    # Ajax endpoint that returns the listing data as JSON.
    url = "https://www.lagou.com/jobs/positionAjax.json?gj=%E5%9C%A8%E6%A0%A1%2F%E5%BA%94%E5%B1%8A&px=default" \
          "&needAddtionalResult=false"
    # Request headers sent with both requests.
    headers = {
        'Host': 'www.lagou.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'https://www.lagou.com/jobs/list_python/p-city_0?px=default&gj=%E5%9C%A8%E6%A0%A1/%E5%'
                   'BA%94%E5%B1%8A',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.418'
                      '3.121 Safari/537.36'
    }
    # Paging is controlled through the ``pn`` field of the POST form data.
    for page in range(1, pages):
        print("正在爬取第" + str(page) + "页，共" + str(pages - 1) + "页...")
        data = {
            'first': 'false',
            'pn': page,
            'kd': position
        }
        # A fresh session per page; the landing-page GET presumably collects the
        # cookies the Ajax endpoint expects. The context manager closes the
        # session so its connections are not leaked across iterations.
        with requests.Session() as s:
            s.get(url=url1, headers=headers, timeout=3)
            respon = s.post(url=url, headers=headers, data=data, timeout=3)
        # Pause between pages before processing the response.
        time.sleep(3)
        results = json.loads(respon.text)['content']['positionResult']['result']
        extractpositiondata(results)
    # Report completion only when at least one page was actually fetched.
    if pages > 1:
        print("爬取完毕！")


def extractpositiondata(results):
    """Write one CSV row per job posting through the module-level ``csv_writer``.

    Args:
        results: Sequence of posting dicts (the ``result`` list of the Ajax
            response). ``None`` or an empty sequence writes nothing.

    Field handling:
        * ``companyLabelList`` / ``skillLables``: items are concatenated, each
          followed by "、" (the trailing separator is kept so the output stays
          identical to what this function has always produced).
        * ``industryField``: ASCII commas are replaced with "|".
        * ``positionAdvantage``: full-width and ASCII commas are replaced with "、".
        * A ``None`` value in any of these four fields is tolerated and ends
          up as an empty CSV cell instead of raising ``TypeError``.

    Raises:
        KeyError: If a posting dict lacks one of the expected keys.
    """
    for result in results or []:
        # Flatten the list-valued fields; a None list is treated as empty.
        company_labels = result['companyLabelList'] or []
        company_label_text = ''.join(label + '、' for label in company_labels)
        skill_labels = result['skillLables'] or []
        skill_label_text = ''.join(skill + '、' for skill in skill_labels)

        # str.replace is a no-op when the character is absent, so no
        # membership pre-check is needed; only guard against None/empty.
        industry_field = result['industryField']
        if industry_field:
            industry_field = industry_field.replace(',', '|')

        position_advantage = result['positionAdvantage']
        if position_advantage:
            position_advantage = position_advantage.replace('，', '、').replace(',', '、')

        # 4. Write the row; csv.writer renders None as an empty string.
        csv_writer.writerow(
            [result['companyFullName'],
             result['positionName'],
             result['companyShortName'],
             result['companySize'],
             industry_field,
             result['financeStage'],
             company_label_text,
             result['firstType'],
             result['secondType'],
             result['thirdType'],
             skill_label_text,
             result['createTime'],
             result['city'],
             result['district'],
             result['salary'],
             result['workYear'],
             result['education'],
             position_advantage])


if __name__ == '__main__':
    try:
        main(10, 'python')
    finally:
        # 5. Close the file even when the crawl raises, so rows that were
        # already written are flushed to disk and the handle is released.
        f.close()