Python爬虫Day_4——lxml&xpath

  • 坚持早起(6点)第四天!!!

使用lxml解析HTML代码:

  1. 解析html字符串:使用lxml.etree.HTML()进行解析。实例代码如下:

    1
    2
    htmlElement = etree.HTML(text)
    print(etree.tostring(html.etreeing='utf-8').decode("utf-8"))
  2. 解析html文件:使用lxml.etree.parse()进行解析。示例代码如下:

    1
    2
    htmlelement = etree.parse("husky brief.html")
    print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))

    这个函数默认使用的是XML解析器,所以如果碰到一些不规范的HTML代码的时候就会解析错误,这时候就要自己创建 HTML解析器。示例代码如下:

    1
    2
    3
    paser = etree.HTMLParser(encoding='utf-8')
    htmlelement = etree.parse("husky brief.html", parser=paser)
    print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))
  3. Demo

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    from lxml import etree


    text = """
    <!DOCTYPE html>
    <html lang="zh">
    <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Husky Brief</title>
    <script src="https://cdn.bootcss.com/jquery/3.3.1/jquery.js"></script>
    </head>
    <body background="images/husky3.jpg">
    ...............................................................
    </body>
    </html>
    """


    def parse_text():
    htmlelement = etree.HTML(text)
    print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))


    def parse_file():
    # 默认xml解析器
    # HTML解析器
    paser = etree.HTMLParser(encoding='utf-8')
    htmlelement = etree.parse("husky brief.html", parser=paser)
    print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))


    if __name__ == '__main__':
    parse_file()

lxml结合xpath注意事项:

  1. 使用xpath语法,应该使用Element.xpath()方法,来执行xpath的选择。示例代码如下:

    1
    trs = html.xpath(”//tr[position()>1]")

    xpath()返回来的永远是一个列表。

  2. 获取某个标签的属性:

    1
    herf = html.xpath("//a/@href")
  3. 获取文本,是通过xpath中的text()函数,示例代码如下:

    1
    title = tr.xpath('./td/text()')[0]
  4. 在某个标签下,再执行xpath()函数,获取这个标签下的子孙元素,那么应该在//之前加一个.,代表是在当前元素下获取,示例代码如下:

    1
    title = tr.xpath('./td/text()')[0]
  5. Demo

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    from lxml import etree


    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse('husky brief.html', parser=parser)

    # 获取所有tr标签
    # //tr
    # trs = html.xpath("//tr")
    # for tr in trs:
    # print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

    # 获取第二个tr标签
    # [0]取出元素
    # tr = html.xpath("//tr[2]")[0]
    # print(tr)
    # print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

    # 获取所有class等于even的tr
    # trs = html.xpath("//tr[@class='even']")
    # for tr in trs:
    # print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))

    # 获取所有a标签的href属性
    # aLsit = html.xpath("//a/@href")
    # for a in aLsit:
    # print(a)

    # 获取信息
    # a=0
    trs = html.xpath('//tr')
    for tr in trs:
    # print(etree.tostring(tr, encoding='utf-8').decode('utf-8'))
    # a += 1
    # print(str(a))
    href = tr.xpath('.//a/@href')
    try:
    title = tr.xpath('./td/text()')[0]
    print(title)
    except:
    pass

爬取豆瓣网电影信息:

  1. Demo

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    import requests
    from lxml import etree


    # 1.将目标网站上的网页抓取下来
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.12'
    '1 Safari/537.36',
    'Referer': 'https://movie.douban.com/'
    }

    url = 'https://movie.douban.com/cinema/nowplaying/beijing/'

    response = requests.get(url, headers=headers)
    text = response.text

    # response.text,返回的是一个经过解码的字符串,是str(unicode)类型
    # response.content,返回的就是一个原生的字符串,就是从网页上抓取下来的,没有经过处理的字符串是bytes类型


    # 2.将抓取下来的数据根据一定的规则进行提取
    html = etree.HTML(text)
    ul = html.xpath("//ul[@class='lists']")[0]
    # print(etree.tostring(ul, encoding='utf-8').decode('utf8'))
    lis = ul.xpath("./li")
    movies = []
    for li in lis:
    # print(etree.tostring(li, encoding='utf-8').decode('utf8'))
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]
    release = li.xpath("@data-release")[0]
    duration = li.xpath("@data-duration")[0]
    region = li.xpath("@data-region")[0]
    director = li.xpath("@data-director")[0]
    actors = li.xpath("@data-actors")[0]
    thumbnail = li.xpath("./ul/li[@class='poster']/a/img/@src")[0]
    movie = {
    'title': title,
    'score': score,
    'release': release,
    'duration': duration,
    'region': region,
    'director': director,
    'actors': actors,
    'thumbnail': thumbnail
    }
    movies.append(movie)
    print(movies)

爬取电影天堂网电影信息:

  1. Demo

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    import requests
    from lxml import etree


    # Site root; prefixed onto the relative detail-page links found in listings.
    # NOTE(review): the name looks like a typo for BASE_DOMAIN — left unchanged
    # because the functions below reference it by this spelling.
    BASS_DOMAN = 'https://www.dygod.net'

    # Desktop-browser User-Agent so the server returns the normal page.
    HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
    '85.0.4183.121 Safari/537.36'
    }


    def main(pages):
    for page in range(1, pages+1):
    print('正在爬取第 ' + str(page) + " 页...")
    url = get_url(page)
    detail_urls = get_detail_urls(url)
    if page == pages:
    print('爬取完毕!!!')
    parse_detail_page(detail_urls)


    def get_url(page):
    if page == 1:
    url = 'https://www.dygod.net/html/gndy/dyzz/index.html'
    else:
    url = 'https://www.dygod.net/html/gndy/dyzz/index_' + str(page) + '.html'
    return url


    def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # detail_urls = map(lambda lurl: BASS_DOMAN + lurl, detail_urls)
    index = 0
    for detail_url in detail_urls:
    detail_url = BASS_DOMAN + detail_url
    detail_urls[index] = detail_url
    index += 1
    return detail_urls


    def parse_detail_page(urls):
    for url in urls:
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    title = html.xpath("//div[@id='Zoom']/text()")[2]
    title = title.replace('\u3000', '')
    region = html.xpath("//div[@id='Zoom']/text()")[4]
    region = region.replace('\u3000', '')
    category = html.xpath("//div[@id='Zoom']/text()")[5]
    category = category.replace('\u3000', '')
    language = html.xpath("//div[@id='Zoom']/text()")[6]
    language = language.replace('\u3000', '')
    release = html.xpath("//div[@id='Zoom']/text()")[8]
    release = release.replace('\u3000', '')
    douban = html.xpath("//div[@id='Zoom']/text()")[9]
    douban = douban.replace('\u3000', '').replace('/10', '')
    duration = html.xpath("//div[@id='Zoom']/text()")[14]
    duration = duration.replace('\u3000', '')
    director = html.xpath("//div[@id='Zoom']/text()")[15]
    director = director.replace('\u3000', '')
    thumbnail = html.xpath("//div[@id='Zoom']/img/@src")[0]
    thumbnail = thumbnail.replace('\u3000', '')
    movies = {
    'title': title,
    'region': region,
    'category': category,
    'language': language,
    'release': release,
    'douban': douban,
    'duration': duration,
    'director': director,
    'thumbnail': thumbnail
    }
    print(movies)


    if __name__ == '__main__':
    main(1)