Python爬虫_lxml&xpath
Python爬虫Day_4——lxml&xpath
- 坚持早起(6点)第四天!!!
使用lxml解析HTML代码:
解析html字符串:使用
lxml.etree.HTML()
进行解析。示例代码如下:1
2htmlElement = etree.HTML(text)
print(etree.tostring(htmlElement, encoding='utf-8').decode("utf-8"))解析html文件:使用
lxml.etree.parse()
进行解析。示例代码如下:1
2htmlelement = etree.parse("husky brief.html")
print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))这个函数默认使用的是
XML
解析器,所以如果碰到一些不规范的HTML
代码的时候就会解析错误,这时候就要自己创建HTML
解析器。示例代码如下:1
2
3parser = etree.HTMLParser(encoding='utf-8')
htmlelement = etree.parse("husky brief.html", parser=parser)
print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))Demo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from lxml import etree

# Sample HTML document fed to parse_text().
text = """
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>Husky Brief</title>
<script src="https://cdn.bootcss.com/jquery/3.3.1/jquery.js"></script>
</head>
<body background="images/husky3.jpg">
...............................................................
</body>
</html>
"""


def parse_text():
    """Parse the in-memory HTML string and print it back out as UTF-8 text."""
    htmlelement = etree.HTML(text)
    print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))


def parse_file():
    """Parse a local HTML file and print it back out as UTF-8 text.

    etree.parse() uses the strict XML parser by default, which raises on
    non-well-formed HTML, so an explicit HTMLParser is supplied.
    """
    # Fix: the local variable was misspelled "paser" in the original.
    parser = etree.HTMLParser(encoding='utf-8')
    htmlelement = etree.parse("husky brief.html", parser=parser)
    print(etree.tostring(htmlelement, encoding='utf-8').decode('utf-8'))


if __name__ == '__main__':
    parse_file()
lxml结合xpath注意事项:
使用
xpath
语法,应该使用Element.xpath()
方法,来执行xpath的选择。示例代码如下:1
trs = html.xpath("//tr[position()>1]")
xpath()
返回来的永远是一个列表。获取某个标签的属性:
1
href = html.xpath("//a/@href")
获取文本,是通过
xpath
中的text()
函数,示例代码如下:1
title = tr.xpath('./td/text()')[0]
在某个标签下,再执行
xpath()
函数,获取这个标签下的子孙元素,那么应该在//
之前加一个.
,代表是在当前元素下获取,示例代码如下:1
title = tr.xpath('./td/text()')[0]
Demo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from lxml import etree

# etree.parse() defaults to the strict XML parser; pass an HTMLParser so
# sloppy real-world HTML still parses.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse('husky brief.html', parser=parser)

# xpath reference examples (Element.xpath always returns a list):
# all tr elements:           trs = html.xpath("//tr")
# second tr element:         tr = html.xpath("//tr[2]")[0]
# tr with class="even":      trs = html.xpath("//tr[@class='even']")
# href of every a element:   aLsit = html.xpath("//a/@href")

# Print the text of the first <td> in each table row.
trs = html.xpath('//tr')
for tr in trs:
    # A leading "." scopes the expression to the current element.
    try:
        title = tr.xpath('./td/text()')[0]
        print(title)
    except IndexError:
        # Fix: was a bare "except:" that hid every error; only the [0] on an
        # empty result (rows without a <td>, e.g. header rows) is expected.
        pass
爬取豆瓣网电影信息:
Demo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import requests
from lxml import etree

# 1. Fetch the "now playing" page from the target site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.12'
                  '1 Safari/537.36',
    'Referer': 'https://movie.douban.com/'
}
url = 'https://movie.douban.com/cinema/nowplaying/beijing/'
response = requests.get(url, headers=headers)
# response.text    -> decoded str (unicode)
# response.content -> raw, undecoded bytes exactly as fetched
text = response.text

# 2. Extract the movie data with xpath.
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
lis = ul.xpath("./li")

# Every field except the thumbnail lives in a data-* attribute on the <li>;
# the original repeated the same xpath line once per field.
ATTR_FIELDS = ('title', 'score', 'release', 'duration', 'region', 'director', 'actors')

movies = []
for li in lis:
    movie = {field: li.xpath("@data-" + field)[0] for field in ATTR_FIELDS}
    movie['thumbnail'] = li.xpath("./ul/li[@class='poster']/a/img/@src")[0]
    movies.append(movie)
print(movies)
爬取电影天堂网电影信息:
Demo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import requests
from lxml import etree

# NOTE(review): name kept for backward compatibility; "BASE_DOMAIN" was
# almost certainly intended.
BASS_DOMAN = 'https://www.dygod.net'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/'
                  '85.0.4183.121 Safari/537.36'
}


def main(pages):
    """Crawl listing pages 1..pages and print every movie's metadata."""
    for page in range(1, pages + 1):
        print('正在爬取第 ' + str(page) + " 页...")
        url = get_url(page)
        detail_urls = get_detail_urls(url)
        if page == pages:
            print('爬取完毕!!!')
        parse_detail_page(detail_urls)


def get_url(page):
    """Return the URL of the page-th listing page (page 1 has no suffix)."""
    if page == 1:
        return 'https://www.dygod.net/html/gndy/dyzz/index.html'
    return 'https://www.dygod.net/html/gndy/dyzz/index_' + str(page) + '.html'


def get_detail_urls(url):
    """Fetch one listing page and return absolute URLs of its detail pages."""
    response = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages; decode the raw bytes explicitly.
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # The hrefs are site-relative; prefix the domain (replaces the original
    # index-mutation loop with a comprehension).
    return [BASS_DOMAN + detail_url for detail_url in detail_urls]


def parse_detail_page(urls):
    """Fetch each detail page and print a dict of the movie's metadata.

    The field positions are hard-coded offsets into the text nodes of the
    #Zoom div, so any site-layout change shifts every field.
    """
    for url in urls:
        response = requests.get(url, headers=HEADERS)
        text = response.content.decode('gbk')
        html = etree.HTML(text)
        # Hoisted: the original evaluated this same xpath nine times per page.
        zoom_texts = html.xpath("//div[@id='Zoom']/text()")

        def clean(i):
            # Strip the full-width spaces the site uses for indentation.
            return zoom_texts[i].replace('\u3000', '')

        movies = {
            'title': clean(2),
            'region': clean(4),
            'category': clean(5),
            'language': clean(6),
            'release': clean(8),
            'douban': clean(9).replace('/10', ''),
            'duration': clean(14),
            'director': clean(15),
            'thumbnail': html.xpath("//div[@id='Zoom']/img/@src")[0].replace('\u3000', ''),
        }
        print(movies)


if __name__ == '__main__':
    main(1)
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 WineMonk!
评论