• 【Python网络编程】爬取百度贴吧、小说内容、豆瓣小说、Ajax爬微博、多线程爬淘宝


    一、爬取百度贴吧

    import re

    # Regexes for the thread-list markup of a saved Baidu Tieba page:
    # thread title, thread author, and reply count respectively.
    titleR = '<a rel="noreferrer" href=".*?" title=".*?" target="_blank" class="j_th_tit ">(.*?)</a>'
    authorR = '<span class=".*?" title="主题作者:(.*?)" data-field'
    reduR = '<span class=".*?" title="回复">(.*?)</span>'

    with open('test.html', 'r', encoding='utf-8') as f:
        data = f.read()

    title = re.findall(titleR, data)
    author = re.findall(authorR, data)
    redu = re.findall(reduR, data)

    # zip() stops at the shortest list, so a page where one pattern matched
    # fewer times no longer raises IndexError the way indexing all three
    # lists by range(len(author)) could.
    for reply_count, poster, thread_title in zip(redu, author, title):
        print(reply_count + poster + '   ' + thread_title + '    ')
    

    二、提取小说内容

    from lxml import etree

    # Read the saved chapter page. The original relied on the platform
    # default encoding, which breaks on Windows (GBK); every other open()
    # in this file uses utf-8, so be explicit here too.
    with open('work2.html', 'r', encoding='utf-8') as f:
        text = f.read()

    html = etree.HTML(text)
    # All text nodes directly under the element with id="content"
    # (the chapter body of the novel page).
    result = html.xpath('//*[@id="content"]/text()')

    with open('斗罗大陆.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(result))
    print(result)
    

    三、豆瓣小说

    from lxml import etree

    with open('work3.html', 'r', encoding='utf-8') as f:
        text = f.read()
    html = etree.HTML(text)

    allInfo = ''
    # A Douban Top-250 page lists 25 movies (li[1]..li[25]); the original
    # range(1, 25) silently dropped the last entry.
    for i in range(1, 26):
        title = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[1]/a/span[1]/text()' % i)
        score = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[2]/div/span[2]/text()' % i)
        comment = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[2]/p[2]/span/text()' % i)
        time = html.xpath('//*[@id="content"]/div/div[1]/ol/li[%d]/div/div[2]/div[2]/p[1]/text()[2]' % i)
        # One line per movie: title, rating, quote, year — newline-terminated.
        # (The original's '\n' literal was garbled by the blog formatting.)
        info = ''.join(title) + ' ' + ''.join(score) + ' ' + ''.join(comment) + ' ' + ''.join(time) + '\n'
        allInfo = allInfo + info

    with open('豆瓣电影.txt', 'w', encoding='utf-8') as f:
        f.write(allInfo)
    

    四、Ajax爬微博

    from urllib.parse import urlencode
    from pyquery import PyQuery as pq
    import requests
    # Base endpoint of Weibo's mobile Ajax container API; get_page() appends
    # the url-encoded query parameters to it.
    base_url = 'https://m.weibo.cn/api/container/getIndex?'
    # Headers that make the request look like the m.weibo.cn front end;
    # X-Requested-With marks it as an Ajax (XHR) call.
    headers = {
        'Host': 'm.weibo.cn',
        'Referer': 'https://m.weibo.cn/u/2360812967',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    def get_page(page=1):
        """Fetch one page of the user's Weibo timeline from the Ajax API.

        Args:
            page: 1-based page index. Defaults to 1, which matches the
                original no-argument behavior, but lets the caller's page
                loop actually paginate.

        Returns:
            The decoded JSON response (dict) on HTTP 200; None on a non-200
            status or a connection error (the latter is printed).
        """
        params = {
            'uid': '2360812967',
            't': '0',
            'luicode': '10000011',
            # NOTE(review): the blog rendering escaped '&' to '&amp;' here;
            # restored to the raw '&' the original code must have used.
            'lfid': '100103type=1&q=李现',
            'type': 'uid',
            'value': '2360812967',
            'containerid': '1076032360812967',
            'page': str(page),
        }
        url = base_url + urlencode(params)
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.json()
        except requests.ConnectionError as e:
            print('Error', e.args)
    
    def parse_page(json):
        """Yield one dict per weibo post found in a getIndex JSON response.

        The first card of the response is a header/profile card with no
        'mblog' payload, so it is skipped (the original used an integer
        flag for the same purpose).

        Args:
            json: the dict returned by get_page(), or None.

        Yields:
            dicts with keys 'id', 'text' (plain text, HTML stripped via
            pyquery), 'attitudes', 'comments', 'reposts'.
        """
        if not json:
            return
        # Guard against a missing 'data'/'cards' instead of crashing on None.
        cards = (json.get('data') or {}).get('cards') or []
        for card in cards[1:]:
            mblog = card.get('mblog')
            if not mblog:
                continue  # non-post card — nothing to extract
            yield {
                'id': mblog.get('id'),
                'text': pq(mblog.get('text')).text(),
                'attitudes': mblog.get('attitudes_count'),
                'comments': mblog.get('comments_count'),
                'reposts': mblog.get('reposts_count'),
            }
    
    
    if __name__ == '__main__':
        # Fetch a single page of the timeline and print every parsed post.
        # ('data' instead of 'json' avoids shadowing the stdlib module name.)
        for page in range(1, 2):
            data = get_page()
            for weibo in parse_page(data):
                print(weibo)
    
    

    五、多线程爬淘宝

    from selenium import webdriver
    import time
    import threading
    
    
    def workthis(name):
        """Search Taobao for `name` in a fresh Chrome instance, log in, and
        print title/price for the items on the first result page.

        NOTE(review): the login phone number and password are hard-coded in
        source — move them to environment variables or a config file before
        sharing this script.
        """
        browser = webdriver.Chrome()
        browser.get('https://www.taobao.com')
        # 'search_box' instead of 'input' — don't shadow the builtin.
        search_box = browser.find_element_by_id('q')
        search_box.send_keys(name)  # type the search keyword
        time.sleep(1)  # brief pause to look human and avoid anti-bot checks
        button = browser.find_element_by_class_name('btn-search')
        button.click()  # submit the search
        # Taobao redirects to a login form before showing results.
        phone = browser.find_element_by_id('fm-login-id')
        phone.send_keys('18224393018')
        password = browser.find_element_by_id('fm-login-password')
        password.send_keys('***********')
        login = browser.find_element_by_xpath('//*[@id="login-form"]/div[4]/button')
        login.click()
        time.sleep(3)  # pause 3s for the results page to load
        # Items on the result page are div[1]..div[47] here; presumably a
        # full page has 48 — TODO confirm whether this should be range(1, 49).
        for i in range(1, 48):
            price = browser.find_element_by_xpath(
                '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[%d]/div[2]/div[1]/div[1]/strong' % i)
            title = browser.find_element_by_xpath(
                '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[%d]/div[2]/div[2]' % i)
            print(title.text + '\t' + price.text)
        browser.quit()
    if __name__ == '__main__':
        # One browser/search per thread; same three keywords, same start order.
        for keyword in ('小米手机', '苹果手机', '华为手机'):
            threading.Thread(target=workthis, args=(keyword,)).start()
    
    
  • 相关阅读:
    最近面试被问到的问题总结(数据研发方向)
    机器学习利器——Scikit-learn的安装
    编写shell脚本遇到的问题
    DBCP配置数据库连接乱码问题
    eclipse的maven项目报Missing artifact jdk.toos:jdk.toos:jar:1.6错
    [备忘]Windows Server 2008 R2部署FTP FileZilla Server防火墙设置
    Hive-0.x.x
    使用ganglia监控hadoop及hbase集群
    开源监控软件ganglia安装手册
    [Hadoop in Action] 第7章 细则手册
  • 原文地址:https://www.cnblogs.com/SiriusZHT/p/14310743.html
Copyright © 2020-2023  润新知