• Web scraping project collection


    Fetching data without logging in

    # coding=utf-8
    """
    Wrap the crawling task in a class.
    Goal: fetch all movies/TV shows for a given region on Douban, ordered by popularity.
    Approach:
        analyze the target url with Chrome devtools,
        build the url,
        send the request and get the data,
        save the data,
        repeat the last three steps until the final page.
    Note: the url addresses in this code are no longer valid.
    """
    import requests
    import json

    class DoubanSpider:
        def __init__(self):
            self.url_temp_list = [
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                    "country": "US"
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                    "country": "UK"
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                    "country": "CN"
                }
            ]
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
                "Referer": "https://m.douban.com/movie/"
            }

        def parse_url(self, url):  # send the request, get the response
            print(url)
            response = requests.get(url, headers=self.headers)
            return response.content.decode()

        def get_content_list(self, json_str):  # extract the data
            dict_ret = json.loads(json_str)
            content_list = dict_ret["subject_collection_items"]
            total = dict_ret["total"]  # total number of items (not necessarily accurate)
            return content_list, total

        def save_content_list(self, content_list, country):  # save
            with open("douban.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    content["country"] = country
                    f.write(json.dumps(content, ensure_ascii=False))
                    f.write("\n")  # newline after each record
            print("saved")

        def run(self):  # main logic
            for url_temp in self.url_temp_list:
                num = 0  # num is the start parameter in the url, i.e. the offset of the first item on the page
                total = 100  # assume there is at least a first page
                while num < total + 18:  # strictly less than: equality would mean the previous request already fetched the last page
                    # 1. build the url
                    url = url_temp["url_temp"].format(num)
                    # 2. send the request, get the response
                    json_str = self.parse_url(url)
                    # 3. extract the data
                    content_list, total = self.get_content_list(json_str)

                    # 4. save every page as it is fetched instead of saving everything at the end,
                    #    so a failure halfway through does not throw away what was already fetched
                    self.save_content_list(content_list, url_temp["country"])
                    # if len(content_list) < 18:  # another valid way to detect the last page
                    #     break
                    # 5. build the next page's url and loop again
                    num += 18


    if __name__ == '__main__':
        douban_spider = DoubanSpider()
        douban_spider.run()
    01. Douban: fetching the hottest movie/TV info
    # coding=utf-8
    import requests
    from retrying import retry

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    }


    @retry(stop_max_attempt_number=3)  # retry up to 3 times before giving up
    def _parse_url(url, method, data, proxies):
        print("*" * 20)
        if method == "POST":
            response = requests.post(url, data=data, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, headers=headers, timeout=3, proxies=proxies)
        assert response.status_code == 200
        return response.content.decode()


    def parse_url(url, method="GET", data=None, proxies={}):
        try:
            html_str = _parse_url(url, method, data, proxies)
        except Exception as e:
            html_str = None

        return html_str


    if __name__ == '__main__':
        url = "https://www.baidu.com"  # the scheme is required, otherwise requests raises MissingSchema
        print(parse_url(url))
    02-1. Generic Tieba spider - parse_url.py
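
    A quick usage sketch of the parse_url helper above (the POST endpoint below is a placeholder chosen for illustration, not part of the original code):

    from parse_url import parse_url

    # plain GET; returns the decoded body, or None after 3 failed attempts
    html = parse_url("https://www.baidu.com")
    print(html is not None)

    # POST with a form body; a proxy could be passed the same way,
    # e.g. proxies={"https": "https://127.0.0.1:8888"} (placeholder address)
    html = parse_url("https://httpbin.org/post", method="POST", data={"kw": "python"})
    print(html is not None)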
    from parse_url import parse_url
    from lxml import etree
    import json
    """
    Crawl any Tieba forum: the title and comment count of each post in the list page,
    plus the images on every page of each post's detail pages, and save them to a file.
    Crawled: 2019/3
    """

    class TieBa:
        def __init__(self, name):
            self.name = name
            self.start_url = f"https://tieba.baidu.com/f?kw={name}&ie=utf-8&pn=0"
            self.root_url = "https://tieba.baidu.com"

        def etree_get_content(self, text):
            html = etree.HTML(text)
            li_list = html.xpath("//li[@class=' j_thread_list clearfix']")  # group the posts first
            data = []
            for i in li_list:
                # print(etree.tostring(i).decode())
                item = {}
                item["title"] = i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()")[0] if i.xpath(
                    ".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()") else None
                item["comments"] = i.xpath(".//span[@class='threadlist_rep_num center_text']/text()")[0] if i.xpath(
                    ".//span[@class='threadlist_rep_num center_text']/text()") else None
                item["href"] = self.root_url + i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href")[
                    0] if i.xpath(
                    ".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href") else None
                item["imgs"] = self.get_img_list(item["href"], [])
                data.append(item)
            # '下一页>' is the literal text of the "next page" link on the list page
            next_url = html.xpath(".//a[text()='下一页>']/@href")[0] if html.xpath(".//a[text()='下一页>']/@href") else None
            if next_url is not None:
                next_url = "https:" + next_url  # the href is protocol-relative; keep None when there is no next page
            return data, next_url

        def pre_html(self, text):
            """
            Strip the HTML comment markers so the list content becomes visible to lxml.
            :return:
            """
            text = text.replace('<!--', '')
            return text.replace('--!>', '')

        def get_img_list(self, next_url, container):
            """Recursively crawl the images posted on every page of a post's detail pages."""
            if next_url is None:
                return container
            detail_content = parse_url(next_url)
            # extract this page's content and the next page's url
            html = etree.HTML(detail_content)
            img_list = html.xpath("//img[@class='BDE_Image']/@src")  # list of image src values
            container.extend(img_list)
            next_url = html.xpath(".//a[text()='下一页']/@href")  # href of the "next page" link
            if next_url:  # not an empty list
                # The return here is optional: container is a mutable list, so the final
                # `return container` below would also hand back the accumulated result.
                # Writing it is still recommended, so every level except the last returns
                # immediately (see the sketch after this block).
                return self.get_img_list(self.root_url + next_url[0], container)
            return container

        def save_content_dict(self, data):
            file_path = self.name + ".txt"
            with open(file_path, 'a+', encoding='utf8') as f:
                for dd in data:
                    f.write(json.dumps(dd, ensure_ascii=False))
                    f.write('\n')

        def run(self):
            # first page
            next_url = self.start_url
            # loop over every page
            while next_url is not None:
                html_str = parse_url(next_url)
                # preprocess the response text
                html_str = self.pre_html(html_str)
                # extract this page's content and the next page's url
                data, next_url = self.etree_get_content(html_str)
                print(data)
                self.save_content_dict(data)


    if __name__ == '__main__':
        name = input("Enter the Tieba forum name: ").strip()
        ba = TieBa(name)
        ba.run()
    02-2. Generic Tieba spider - main.py
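
    The comment inside get_img_list claims that the explicit return on the recursive call is optional because the accumulator is a mutable list. A minimal stand-alone sketch of that point (collect_a / collect_b are illustrative names, not from the original code):

    def collect_a(n, container):
        # Variant A: propagate the recursive result immediately (what the Tieba code does).
        if n == 0:
            return container
        container.append(n)
        return collect_a(n - 1, container)


    def collect_b(n, container):
        # Variant B: ignore the recursive return value; the shared mutable list is still
        # fully populated, and the top-level call returns it at the end.
        if n == 0:
            return container
        container.append(n)
        collect_b(n - 1, container)
        return container


    print(collect_a(3, []))  # [3, 2, 1]
    print(collect_b(3, []))  # [3, 2, 1]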
     1 """
     2 抓取https://www.qiushibaike.com所有热门 中数据
     3 爬取时间:2019/4
     4 """
     5 from parse_url import parse_url
     6 from lxml import etree
     7 import json
     8 
     9 
    10 class QiuShi:
    11 
    12     def __init__(self):
    13         self.start_url = "https://www.qiushibaike.com/8hr/page/{}/"  # 根据规律构建全部url地址
    14         self.part_url = "https://www.qiushibaike.com"
    15 
    16     def etree_get_content(self, text):
    17         etree_elemnt = etree.HTML(text)
    18         # 先分组
    19         content_list = etree_elemnt.xpath("//div[@class='recommend-article']/ul/li")
    20         data = []
    21         for li in content_list:
    22             item = {}
    23             try:
    24                 item['title'] = li.xpath(".//a[@class='recmd-content']/text()")[0] if li.xpath(".//a[@class='recmd-content']/text()") else None
    25                 item['href'] = self.part_url + li.xpath(".//a[@class='recmd-content']/@href")[0] if li.xpath(".//a[@class='recmd-content']/@href") else None
    26                 item['laugh_num'] = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None
    27                 item['comment_num'] = li.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None
    28             except Exception as e:
    29                 print(e)
    30                 continue
    31             data.append(item)
    32         return data
    33 
    34     def save_content_dict(self, data):
    35         file_path = "糗事百科热门.txt"
    36         with open(file_path, 'a+', encoding='utf8') as f:
    37             for dd in data:
    38                 f.write(json.dumps(dd, ensure_ascii=False))
    39                 f.write('
    ')
    40 
    41     def run(self):
    42         # 构建url地址列表
    43         for i in range(1, 14):
    44             # 获取每一页目标响应
    45             html_str = parse_url(self.start_url.format(i))
    46             # 解析页面
    47             data = self.etree_get_content(html_str)
    48             # 每一页保存一次
    49             self.save_content_dict(data)
    50 
    51 
    52 if __name__ == '__main__':
    53     q = QiuShi()
    54     q.run()
    03. Crawling qiushibaike - single-threaded
     1 """
     2 多线程 抓取https://www.qiushibaike.com所有热门 中数据,
     3 但是该网站布局已经改版了,部分xpath解析的位置已经不在了
     4 爬取时间:2017/10
     5 """
     6 import requests
     7 from lxml import etree
     8 import threading
     9 from queue import Queue
    10 
    11 
    12 class QiubaiSpdier:
    13     def __init__(self):
    14         self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
    15         self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    16         self.url_queue = Queue()
    17         self.html_queue  = Queue()
    18         self.content_queue = Queue()
    19 
    20     def get_url_list(self):
    21         # return [self.url_temp.format(i) for i in range(1,14)]
    22         for i in range(1,4):
    23             self.url_queue.put(self.url_temp.format(i))
    24 
    25     def parse_url(self):
    26         while True:
    27             url = self.url_queue.get()
    28             print(url)
    29             response = requests.get(url,headers=self.headers)
    30             self.html_queue.put(response.content.decode())
    31             self.url_queue.task_done() # 注意必须url的get并处理好url的响应put到对应的队列后,再调用url的task_done使计数减一
    32 
    33     def get_content_list(self): # 提取数据
    34         while True:
    35             html_str = self.html_queue.get()
    36 
    37             html = etree.HTML(html_str)
    38             div_list = html.xpath("//div[@id='content-left']/div")  #分组
    39             content_list = []
    40             for div in div_list:
    41                 item= {}
    42                 item["content"] = div.xpath(".//div[@class='content']/span/text()")
    43                 item["content"] = [i.replace("
    ","") for i in item["content"]]
    44                 item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
    45                 item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None
    46                 item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
    47                 item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None
    48                 item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
    49                 item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None
    50                 item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
    51                 item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None
    52                 content_list.append(item)
    53             self.content_queue.put(content_list)
    54             self.html_queue.task_done() # 注意task_done放在put后面,确保get的结果处理完并且已经put都对应的队列中
    55 
    56     def save_content_list(self): # 保存
    57         while True:
    58             content_list = self.content_queue.get()
    59             for i in content_list:
    60                 # print(i)
    61                 pass
    62             self.content_queue.task_done()
    63 
    64     def run(self): #实现主要逻辑
    65         thread_list = []
    66         #1.url_list
    67         t_url = threading.Thread(target=self.get_url_list)
    68         thread_list.append(t_url)
    69         #2.遍历,发送请求,获取响应
    70         for i in range(20):
    71             t_parse = threading.Thread(target=self.parse_url)
    72             thread_list.append(t_parse)
    73         #3.提取数据
    74         for i in range(2):
    75             t_html = threading.Thread(target=self.get_content_list)
    76             thread_list.append(t_html)
    77         #4.保存
    78         t_save = threading.Thread(target=self.save_content_list)
    79         thread_list.append(t_save)
    80         for t in thread_list:
    81             t.setDaemon(True) # 把子线程设置为守护线程,主线程结束,子线程结束
    82             t.start()
    83 
    84         for q in [self.url_queue,self.html_queue,self.content_queue]:
    85             # 调用此方法让主线程阻塞,直到队列中所有的项目均被处理。阻塞将持续到队列中的每个项目均调用q.task_done()方法为止
    86             q.join()
    87 
    88 if __name__ == '__main__':
    89     qiubai = QiubaiSpdier()
    90     qiubai.run()
    91     print("主线程结束")
    03. Crawling qiushibaike - multi-threaded
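
    The task_done/join discipline used above is the heart of the queue-based pipeline. A stripped-down, self-contained sketch of the same pattern (produce/consume are illustrative names, not from the original code):

    import threading
    from queue import Queue

    work_q = Queue()


    def produce():
        for i in range(5):
            work_q.put(i)


    def consume():
        while True:
            n = work_q.get()
            print("processed", n * n)
            work_q.task_done()  # only after the item is fully handled


    producer = threading.Thread(target=produce)
    consumer = threading.Thread(target=consume)
    consumer.daemon = True  # dies together with the main thread
    producer.start()
    consumer.start()

    producer.join()  # every item has been put
    work_q.join()    # blocks until task_done() was called once per put item
    print("all items processed; the daemon consumer dies when the main thread exits")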
     1 """
     2 第一页:
     3 Request URL: https://www.douyu.com/directory/all
     4 Request Method: GET
     5 
     6 
     7 第二页:
     8 Request URL: https://www.douyu.com/gapi/rkc/directory/0_0/2
     9 Request Method: GET
    10 
    11 爬取时间:2019/4
    12 """
    13 import json
    14 import time
    15 from retrying import retry
    16 from selenium import webdriver
    17 from selenium.webdriver.chrome.options import Options
    18 
    19 chrome_options = Options()
    20 chrome_options.add_argument('--headless')  # 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败
    21 
    22 class DouYuSpider:
    23     def __init__(self):
    24         self.start_url = 'https://www.douyu.com/directory/all'
    25         self.driver = webdriver.Chrome(chrome_options=chrome_options)
    26 
    27     def save_content_dict(self, data):
    28         file_path = 'douyu-room' + ".txt"
    29         with open(file_path, 'a+', encoding='utf8') as f:
    30             for dd in data:
    31                 f.write(json.dumps(dd, ensure_ascii=False))
    32                 f.write('
    ')
    33             f.flush()
    34 
    35     @retry(stop_max_attempt_number=3)
    36     def get_next_page_click(self):
    37         next_page = self.driver.find_elements_by_xpath("//li[@class=' dy-Pagination-next']/span")
    38         # 最后一页的 下一页 父元素 class=‘dy-Pagination-disabled dy-Pagination-next’ 表示不可点击了
    39         if len(next_page) == 0:
    40             return -1
    41         else:
    42             next_page[0].click()
    43 
    44     def get_single_page(self):
    45         # 先分组
    46         room_list = self.driver.find_elements_by_xpath(
    47             "//div[@class='layout-Module-container layout-Cover ListContent']/ul/li")
    48         data = []
    49         for room in room_list:
    50             item = {}
    51             item['title'] = room.find_element_by_xpath(".//h3[@class='DyListCover-intro']").text
    52             item['zone'] = room.find_element_by_xpath(".//span[@class='DyListCover-zone']").text
    53             # item['img'] = room.find_element_by_xpath(".//img[@class='DyImg-content is-normal']").get_attribute(
    54             #     'src')
    55             item['anchor_name'] = room.find_element_by_xpath(".//h2[@class='DyListCover-user']").text
    56             data.append(item)
    57         return data
    58 
    59 
    60     def run(self):
    61         # 第一页
    62         self.driver.get(self.start_url)
    63         self.driver.implicitly_wait(12)
    64 
    65         while True:
    66             # 获取每一页的页面结构化数据
    67             data = self.get_single_page()
    68             # 保存数据
    69             self.save_content_dict(data)
    70             # 查找下一页url,并点击
    71             try:
    72                 ret = self.get_next_page_click()
    73                 time.sleep(2) # 等待页面加载完全
    74                 if ret == -1:
    75                     break
    76             except Exception as e:
    77                 print(e)
    78 
    79         self.driver.quit()
    80 
    81 
    82 if __name__ == '__main__':
    83     douyu = DouYuSpider()
    84     douyu.run()
    85 
    86 """
    87 优化建议:
    88 1.把每一页的self.driver.page_source 页面字符串传给lxml的etree去处理
    89 2.staleness_of 尝试失败 https://www.mlln.cn/2018/05/22/python-selenium如何在点击后等待页面刷新
    90 
    91 """
    04. Crawling Douyu live-stream room info
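
    A hedged sketch of improvement 1 above: hand driver.page_source to lxml and do the extraction there instead of through per-element WebDriver lookups. The xpath expressions simply reuse the class names from the Selenium code and may have drifted since:

    from lxml import etree


    def parse_page_source(page_source):
        """Parse one rendered Douyu list page with lxml instead of WebDriver element calls."""
        html = etree.HTML(page_source)
        rooms = html.xpath("//div[@class='layout-Module-container layout-Cover ListContent']/ul/li")
        data = []
        for room in rooms:
            raw = {
                "title": room.xpath(".//h3[@class='DyListCover-intro']/text()"),
                "zone": room.xpath(".//span[@class='DyListCover-zone']/text()"),
                "anchor_name": room.xpath(".//h2[@class='DyListCover-user']/text()"),
            }
            # xpath returns lists; keep the first match or None
            data.append({k: (v[0] if v else None) for k, v in raw.items()})
        return data


    # inside DouYuSpider.run() this would replace the get_single_page() call:
    # data = parse_page_source(self.driver.page_source)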

    Auto-login examples

     1 """
     2 套路:登录首页的时候,已经给浏览器设置cookies,此时未激活
     3 登录成功后返回假的cookies,激活未激活的cookies,
     4 
     5 """
     6 import requests
     7 from bs4 import BeautifulSoup
     8 
     9 headers = {
    10     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    11 }
    12 
    13 index = requests.get("https://dig.chouti.com/", headers=headers)
    14 cookies = index.cookies.get_dict()
    15 
    16 
    17 # ===========================点赞=================
    18 
    19 # 1.登录
    20 login = requests.post(
    21     "https://dig.chouti.com/login",
    22     data={
    23         "phone": 8615026809593,
    24         "password":'dajiahaa',
    25     },
    26     headers=headers,
    27     cookies=cookies)
    28 
    29 # 2.点赞
    30 dizan = requests.post(
    31     url="https://dig.chouti.com/link/vote?linksId=25389911",
    32     cookies=cookies,
    33     headers=headers)
    34 
    35 print(dizan.text)
    01. Chouti (dig.chouti.com)
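
    The same cookie bookkeeping can also be delegated to requests.Session, which keeps the cookies from the first visit and sends them on every later request automatically. A sketch under that assumption, mirroring the endpoints above (the credentials are placeholders):

    import requests

    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    }

    session = requests.Session()
    session.headers.update(headers)

    # first visit: the session stores whatever cookies the site sets
    session.get("https://dig.chouti.com/")

    # login: the stored cookies are sent automatically and get activated
    session.post("https://dig.chouti.com/login",
                 data={"phone": "86130xxxxxxxx", "password": "your-password"})  # placeholders

    # upvote: still the same cookie jar, no manual passing needed
    resp = session.post("https://dig.chouti.com/link/vote?linksId=25389911")
    print(resp.text)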
     1 """
     2 套路:
     3 - 带请求头
     4 - 带cookie
     5 - 请求体中:
     6     commit:Sign in
     7     utf8:✓
     8     authenticity_token:放在页面隐藏表单中
     9     login:asdfasdfasdf
    10     password:woshiniba8
    11 
    12 """
    13 import requests
    14 from bs4 import BeautifulSoup
    15 
    16 headers = {
    17     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    18 }
    19 
    20 login = requests.get(
    21     "https://github.com/login",
    22     headers=headers,
    23 )
    24 cookies = login.cookies.get_dict()
    25 login_par = BeautifulSoup(login.content, 'html.parser')
    26 token_input = login_par.find(name='input', attrs={"name": "authenticity_token"})
    27 
    28 authenticity_token = token_input.attrs.get("value")
    29 # 1.登录
    30 re_login = requests.post(
    31     "https://github.com/session",
    32     data={
    33         "commit": "Sign in",
    34         "utf8":"",
    35         "login": "cpcp@163.com",
    36         "password": 'cs11187',
    37         "authenticity_token": authenticity_token,
    38         "webauthn-support": "supported"
    39     },
    40     cookies=cookies,
    41     headers={
    42         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    43         "Referer": "https://github.com/login"
    44     }
    45 )
    46 
    47 print(re_login.text)
    02. GitHub
  • Original article: https://www.cnblogs.com/carlous/p/10624842.html