• Crawling a Baidu Tieba thread with requests + XPath + map


    # Crawl a Baidu Tieba thread with requests + XPath + map
    # Target data: reply author's user name, reply content, reply time
    # Breakdown:
    #   requests fetches each page
    #   XPath extracts the target fields
    #   map drives the multithreaded crawl
    import requests
    from requests.exceptions import RequestException
    from lxml import etree
    import json
    from multiprocessing.dummy import Pool as ThreadPool

    def get_html(url):
        # Fetch one page; return its HTML text, or None on any failure.
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            else:
                return None
        except RequestException:
            return None

    def parse_html(html):
        selector = etree.HTML(html)
        # Each post lives in a div with this class; the trailing spaces match
        # the page markup exactly.
        data = selector.xpath('//div[@class="l_post j_l_post l_post_bright  "]')
        for each in data:
            # The data-field attribute holds a JSON blob with the post metadata.
            rs = each.xpath('@data-field')[0]
            rs = json.loads(rs)
            author = rs.get('author').get('user_name')
            post_id = rs.get('content').get('post_id')
            content = each.xpath('div/div/cc/div[@id="post_content_%s"]/text()' % post_id)[0].strip()
            date = rs.get('content').get('date')
            yield {
                'author': author,
                'content': content,
                'date': date
            }

    def save_to_txt(result):
        print('Saving:', result)
        with open('tieba.txt', 'a', encoding='utf-8') as f:
            f.write('Author: ' + result['author'] + '\n')
            f.write('Content: ' + result['content'] + '\n')
            f.write('Date: ' + result['date'] + '\n')
            f.write('\n')

    def main(url):
        html = get_html(url)
        if html:
            for result in parse_html(html):
                save_to_txt(result)

    if __name__ == '__main__':
        pool = ThreadPool(4)
        urls = []
        base_url = 'http://tieba.baidu.com/p/3522395718?pn='
        for page_num in range(1, 21):
            url = base_url + str(page_num)
            urls.append(url)

        pool.map(main, urls)
        pool.close()
        pool.join()
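The "map" step uses multiprocessing.dummy.Pool, which exposes the multiprocessing API but runs the jobs on threads, a good fit for I/O-bound page downloads. The same driver can also be written with the standard-library concurrent.futures module. The sketch below is a minimal alternative to the __main__ block, assuming main() and the same thread URL as in the listing above; max_workers=4 mirrors ThreadPool(4).

    # Minimal sketch of an alternative __main__ block, assuming main() is
    # defined exactly as in the listing above.
    from concurrent.futures import ThreadPoolExecutor

    if __name__ == '__main__':
        base_url = 'http://tieba.baidu.com/p/3522395718?pn='
        urls = [base_url + str(page_num) for page_num in range(1, 21)]
        # executor.map, like Pool.map, applies main to every URL; the with
        # block waits for all worker threads to finish before exiting.
        with ThreadPoolExecutor(max_workers=4) as executor:
            list(executor.map(main, urls))

With either pool, several threads append to the same tieba.txt concurrently, so records from different pages may interleave; keeping the worker count small limits this.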
• Original article: https://www.cnblogs.com/themost/p/7081713.html