# BeautifulSoup模块爬图学习HTML文本解析标签定位
# 网上教程多是爬mzitu,此网站反爬限制多了。随意找了个网址,解析速度有些慢。
# 脚本流程:首页获取总页数 --> 拼接每页URL --> 获取每页中所有主题URL --> 遍历图片源URL下载,保存
#python3
#coding:utf-8
#_author: Jack
#_date: 2020/3/28
import os
import sys
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
8
# Directory containing this script; appended to sys.path so sibling
# modules next to the script stay importable when run from elsewhere.
DIR_PATH = os.path.dirname(os.path.abspath(__file__))
sys.path.append(DIR_PATH)


# Browser-like User-Agent so the site serves normal pages to the crawler.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:74.0) Gecko/20100101 Firefox/74.0',
}
16
def create_dir(file_path):
    '''
    Create *file_path* (including parents) if it does not exist,
    then change the working directory into it.

    :param file_path: images directory to create and enter
    :return: None
    '''
    if not os.path.exists(file_path):
        os.makedirs(file_path)
        # fixed typo: was 'Creatr directory:'
        print('Create directory:', file_path)
    os.chdir(file_path)  # subsequent relative writes land in the image dir
26
def save_data(src, dir_name, file_name):
    '''
    Download one image and store it under images/<dir_name>/<file_name>,
    skipping files that already exist on disk.

    :param src: image source URL
    :param dir_name: bucket directory name (e.g. 100, 200, ...) — docstring
                     previously documented a nonexistent ``sum`` parameter
    :param file_name: file name for the saved image
    :return: None
    '''
    file_path = os.path.join(DIR_PATH, 'images', str(dir_name))  # bucket dir
    image_path = os.path.join(file_path, file_name)              # final path
    create_dir(file_path)

    if not os.path.isfile(image_path):
        # timeout so one dead image host cannot hang the whole crawl
        req = requests.get(src, headers=HEADER, timeout=30)
        with open(image_path, 'wb') as f_save:
            # the 'with' block flushes and closes on exit — the original's
            # explicit flush() after the final write was redundant
            f_save.write(req.content)
        print('Download successful:', file_name)
    else:
        print('File already exists! Pass')
46
def request_to_url(url, header, timeout=30):
    '''
    GET *url* and return the response body as text.

    :param url: page URL
    :param header: request headers dict (docstring previously said ``head``)
    :param timeout: seconds before the request is aborted; default 30.
                    New keyword with a default — existing callers unaffected.
                    Without it a stalled server hangs the crawl forever.
    :return: response text
    '''
    res = requests.get(url, headers=header, timeout=timeout)
    return res.text
55
def soup(url, header):
    '''
    Fetch *url* and parse the returned HTML into a BeautifulSoup tree.

    :param url: page URL
    :param header: request headers dict
    :return: BeautifulSoup document object
    '''
    html_text = request_to_url(url, header)
    return BeautifulSoup(html_text, 'html.parser')
63
def action(url):
    '''
    Crawl the whole site: read the total page count from the pager, walk
    every listing page, open every theme, and download each image.
    Images are grouped into folders of 100 (dir 100, 200, 300, ...).

    :param url: site root URL
    :return: None
    '''
    download_count = 0
    try:
        # The last-but-one pager link carries the total page count.
        pager_links = soup(url, HEADER).find('div', class_='pg').find_all('a')
        max_page = int(pager_links[-2].text.split(' ')[-1])

        # Distinct loop variables per level — the original reused `i` for
        # all three nested loops, shadowing the outer values.
        for page_no in range(1, max_page + 1):  # each listing page
            # urljoin, not os.path.join: URL separators are always '/',
            # so this also works on Windows.
            page_url = urljoin(url, 'forum.php?order=&fid=0&page=%d' % page_no)
            kind_show = soup(page_url, HEADER).find('div', class_='kind_show')
            theme_list = kind_show.find_all('div', class_='photo_thumb kind_left')

            for theme_div in theme_list:  # each theme on the page
                theme = theme_div.find('div', class_='title').find('a')
                img_url = theme.get('href')
                print("Ready download: %s" % theme.string, img_url)
                img_tags = soup(img_url, HEADER).find('td', class_='t_f').find_all('img')

                for img_tag in img_tags:  # each image in the theme
                    try:
                        img_src = img_tag.get('src')
                        # Images 1-100 -> dir 100, 101-200 -> dir 200, ...
                        # (the original pre-incremented and started at 200).
                        dir_name = 100 * (download_count // 100 + 1)
                        save_data(img_src, dir_name, img_src.split('/')[-1])
                        download_count += 1
                        print('Download successful: %d' % download_count)
                    except Exception as e:
                        # One broken image must not abort the whole theme.
                        print('Img_tag & Save_data Error:', e)
                        continue

    except Exception as e:
        print('The trunk Error:', e)
105
if __name__ == '__main__':
    # Script entry point: crawl the site root and download every image.
    print('Run.....')
    start_url = 'http://www.lesb.cc/'
    action(start_url)
    print('Perform !')