Web Scraping: Three Mini Projects


    Crawling Tieba

    Before writing any code, think through the functional blocks you will need; while writing, lay down the name of each functional module first (a skeleton sketch follows below).
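
    For example, the Tieba spider below can be stubbed out as a skeleton first. This is only an illustrative sketch; the method names match the source code shown later, and the bodies are placeholders:

    class TiebaSpider:
        """Skeleton: name the functional blocks first, fill them in afterwards."""

        def __init__(self, tieba_name_crawl):
            pass  # initialization: forum topic, base URL, request headers

        def make_url(self):
            pass  # URL generation: one address per page

        def download_url(self, url_str):
            pass  # download: GET one page and return its content

        def save_result(self, result, page_num):
            pass  # save: write one page's result to its own file

        def run(self):
            pass  # control flow: tie the steps above together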

    Initialization

    Initialize the required parameters and complete the basic setup. The example crawls the Baidu Tieba "lol" forum; the GET parameters in the crawl URL must be passed in (they select the forum topic and the page to fetch).

    • Forum topic name
    • Base URL
    • Request headers

    URL Generation

    Generate the URL of each page to crawl.

    • Use a list comprehension to build the addresses of multiple pages (a quick sketch follows below)
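
    A quick sketch of producing the page URLs with a list comprehension. The pn step of 50 threads per page is an assumption about Tieba's paging, mirrored in the source code below:

    url_base = 'https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn={}'
    # One URL per page; pn is assumed to advance by 50 threads per page.
    urls = [url_base.format(i * 50) for i in range(4)]
    # -> ['...pn=0', '...pn=50', '...pn=100', '...pn=150']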

    Download

    Send a GET request to each page's URL and fetch the page.
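
    A minimal sketch of the download step. The timeout and status check are defensive extras that are not in the original code, and fetch is just an illustrative helper name:

    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def fetch(url):
        # GET one page; timeout and raise_for_status are optional extras
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()  # fail loudly on an HTTP error status
        return resp.text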

    Save

    Save the crawl results to files, writing each page's result to a file named after it.
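
    The source code below writes bytes in 'wb' mode; an equivalent sketch in text mode with an explicit encoding (save_page is just an illustrative helper name):

    def save_page(result, page_num, tieba_name='lol'):
        # Write one page's HTML to its own file, letting open() handle the encoding.
        file_path = './download/{}_page_{}.html'.format(tieba_name, page_num)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(result)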

    Control Flow

    Wrap the crawl steps above in a run() method so that external callers can use the spider easily; multithreading can be added here later (a threaded sketch follows after this list).

    • Generate the URL of every page to crawl
    • Loop over the URLs with a for statement
    • Crawl each URL, determine its page number, and save the result
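
    The multithreading mentioned above could later look roughly like the sketch below, using the standard library's ThreadPoolExecutor to download pages concurrently. This is only an illustrative variant, not part of the original source:

    from concurrent.futures import ThreadPoolExecutor

    def run_threaded(spider, max_workers=4):
        # Hypothetical threaded variant of run(): fetch all pages concurrently, then save them.
        urls = spider.make_url()
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            results = list(pool.map(spider.download_url, urls))  # keeps page order
        for page_num, result in enumerate(results, start=1):
            spider.save_result(result=result, page_num=page_num)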

    Source Code

    import os
    import requests


    class TiebaSpider:
        def __init__(self, tieba_name_crawl):
            """
            Initialize the required parameters and complete the basic setup.
            Crawls a Baidu Tieba forum (e.g. "lol"); the GET parameters in the URL
            select the forum topic and the page to fetch.
            """
            self.tieba_name = tieba_name_crawl
            self.url_base = 'https://tieba.baidu.com/f?kw=' + tieba_name_crawl + '&ie=utf-8&pn={}'
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
            os.makedirs('./download', exist_ok=True)  # make sure the output directory exists

        def make_url(self):
            """
            Generate the URL of every page to crawl.
            :return: list of page URLs (built with a list comprehension)
            """
            # Tieba's pn parameter is a thread offset: 50 threads per page.
            return [self.url_base.format(i * 50) for i in range(4)]

        def download_url(self, url_str):
            """
            Send a GET request to one page's URL and fetch the page.
            :param url_str: the URL of one page
            :return: the crawled page as text
            """
            result = requests.get(url_str, headers=self.headers)
            return result.text

        def save_result(self, result, page_num):
            """
            Save one page's crawl result to a file.
            :param result: the crawled page
            :param page_num: page number, used to name the output file
            """
            file_path = './download/{}_page_{}.html'.format(self.tieba_name, page_num)
            with open(file_path, 'wb') as f:
                f.write(result.encode('utf-8'))

        def run(self):
            """
            Wrap the crawl steps above so external callers only need run();
            multithreading can be added here later.
            - generate the URL of every page to crawl
            - loop over the URLs
            - crawl each URL and save the result under its page number
            """
            url_lists = self.make_url()
            for p_num, url_str in enumerate(url_lists, start=1):
                result_str = self.download_url(url_str)
                self.save_result(result=result_str, page_num=p_num)


    if __name__ == '__main__':
        tieba_spider = TiebaSpider('lol')
        tieba_spider.run()

    Crawling Qiushibaike

    import os
    import requests


    class QiushiSpider:
        def __init__(self):
            """
            Initialize the required parameters and complete the basic setup.
            Example page URL: https://www.qiushibaike.com/8hr/page/2/
            """
            self.url_base = 'https://www.qiushibaike.com/8hr/page/{}/'
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
            os.makedirs('./download', exist_ok=True)  # make sure the output directory exists

        def make_url(self):
            # Pages on qiushibaike.com are numbered from 1.
            return [self.url_base.format(i) for i in range(1, 5)]

        def download_url(self, url_str):
            result = requests.get(url_str, headers=self.headers)
            # The raw HTML is returned as-is; a parsing sketch follows after this listing.
            return result.text

        def save_result(self, result, page_num):
            with open('./download/qiushi' + str(page_num) + '.html', 'ab') as f:
                f.write(result.encode('utf-8'))


    if __name__ == '__main__':
        qiushi = QiushiSpider()
        for page_num, url in enumerate(qiushi.make_url(), start=1):
            qiushi_text = qiushi.download_url(url_str=url)
            qiushi.save_result(result=qiushi_text, page_num=page_num)
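
    The listing above only saves the raw HTML. Below is a sketch of extracting each item's text with lxml; the XPath assumes the page still places each item's text in div.content > span, which may have changed since, and extract_jokes is just an illustrative helper name:

    import lxml.html
    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def extract_jokes(url):
        # Fetch one listing page and pull out the text of every item.
        result = requests.get(url, headers=headers)
        html = lxml.html.fromstring(result.text)
        return html.xpath('//div[@class="content"]/span[1]/text()')

    # Example usage:
    # for text in extract_jokes('https://www.qiushibaike.com/8hr/page/1/'):
    #     print(text.strip())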

    Crawling Country Information

    The BeautifulSoup Approach

    import requests
    from bs4 import BeautifulSoup


    class CountrySoup:
        def __init__(self, country_name):
            self.country_name = country_name
            self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

        def download_url(self):
            result = requests.get(self.url_base, headers=self.headers)
            soup = BeautifulSoup(result.text, 'lxml')
            # The country name sits in the table row with id "places_country__row";
            # the value itself is in the cell with class "w2p_fw".
            tr = soup.find(attrs={'id': 'places_country__row'})
            print(tr, type(tr))
            td = tr.find(attrs={'class': 'w2p_fw'})
            print(td, type(td))
            print(td.text)
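
    The same lookup can also be written with CSS selectors via BeautifulSoup's select_one, which some readers find easier to follow than nested find calls. A sketch assuming the same page structure; country_field is just an illustrative helper name:

    import requests
    from bs4 import BeautifulSoup

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def country_field(country_name, row_id='places_country__row'):
        # Fetch the country page and read one field via a CSS selector.
        url = 'http://example.webscraping.com/places/default/view/{}'.format(country_name)
        soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
        td = soup.select_one('#{} .w2p_fw'.format(row_id))
        return td.text if td is not None else None

    # print(country_field('Bolivia-27'))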

    The lxml Approach

    import requests
    import lxml.html


    class CountrySpider:
        def __init__(self, country_name):
            self.country_name = country_name
            self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

        def download_url(self, url_str):
            result = requests.get(url_str, headers=self.headers)
            html = lxml.html.fromstring(result.text)
            # Each field sits in a <tr> with a fixed id; the value is in the td with class "w2p_fw".
            data_country = html.xpath('//tr[@id="places_country__row"]/td[@class="w2p_fw"]/text()')
            data_capital = html.xpath('//tr[@id="places_capital__row"]/td[@class="w2p_fw"]/text()')
            data_area = html.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')
            data_all = ['Country: ' + data_country[0], 'Capital: ' + data_capital[0], 'Area: ' + data_area[0]]
            return data_all

        def save_result(self, result):
            print(type(result), result)
            for r in result:
                r = r + '\n'
                with open('./country.txt', 'ab') as f:
                    f.write(r.encode('utf-8'))

        def run(self):
            result = self.download_url(self.url_base)
            self.save_result(result)


    if __name__ == '__main__':
        c = CountrySpider('Bolivia-27')
        c.run()

        s = CountrySoup('Bolivia-27')
        s.download_url()