• 练习4-今日头条爬取


    import requests
    from urllib.parse import urlencode
    from pyquery import PyQuery as pq
    import re,os
    from hashlib import md5
    
    def get_page(page_num, search_id):
        """Fetch one page of Toutiao search results for the keyword '街拍'.

        Args:
            page_num: Zero-based page index. Page 0 is requested with the
                initial-search parameters; every other page is requested
                with the pagination parameters.
            search_id: Search identifier sent with pagination requests. It
                is not sent when ``page_num`` is 0.

        Returns:
            The response body as text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status or a failed request).
        """
        # Only build the parameter set that this request actually uses.
        if page_num == 0:
            params = {
                'dvpf': 'pc',
                'source': 'input',
                'keyword': '街拍',
            }
        else:
            params = {
                'keyword': '街拍',
                'pd': 'synthesis',
                'source': 'pagination',
                'dvpf': 'pc',
                'aid': 4916,
                'page_num': page_num,
                'search_id': search_id,
            }
        url = 'https://so.toutiao.com/search?' + urlencode(params)
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=10)
        except requests.RequestException as e:
            print('ERROR1:', e)
            return None
        if response.status_code == 200:
            return response.text
        # Report the failure instead of silently falling through.
        print('ERROR1: unexpected status code', response.status_code)
        return None
    
    def parse_pg(html):
        """Yield the ``src`` attribute of each ``.abs-fill img`` element.

        Args:
            html: HTML text of a result page. ``None`` or an empty string
                (e.g. after a failed download) produces no items instead of
                raising.

        Yields:
            The ``src`` value of every matching ``<img>`` that has one. Each
            value is also printed before it is yielded.
        """
        if not html:
            # Nothing to parse; the generator simply ends.
            return
        doc = pq(html)
        for img in doc('.abs-fill img').items():
            src = img.attr('src')
            if not src:
                # An <img> without a src has nothing to download.
                continue
            print(src)
            yield src
    
    
    def save_img(img):
        """Download the image at URL ``img`` and store it as ``<md5>.jpg``.

        The file name is the MD5 hex digest of the downloaded bytes, so an
        image whose content was already saved is not written again.

        Args:
            img: URL of the image. ``None`` or an empty string is ignored.
                A protocol-relative URL (``//host/path``) is fetched over
                HTTPS.

        Returns:
            None. Request and file errors are printed, not raised.
        """
        if not img:
            return
        if img.startswith('//'):
            # requests rejects URLs that have no scheme.
            img = 'https:' + img
        # NOTE(review): this path looks like it lost its backslashes
        # (perhaps D:\pycharm_projects\街拍) -- confirm the intended folder.
        save_dir = r'D:pycharm_projects街拍'
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(save_dir, exist_ok=True)
        try:
            # Without a timeout a stalled download would block forever.
            response = requests.get(img, timeout=10)
            if response.status_code != 200:
                return
            file_path = '{}/{}.{}'.format(
                save_dir, md5(response.content).hexdigest(), 'jpg')
            if os.path.exists(file_path):
                print('alredy download')
                return
            with open(file_path, 'wb') as f:
                f.write(response.content)
        except (requests.RequestException, OSError) as e:
            print('ERROR2:', e)
    
    
    def main():
        """Crawl the first two result pages and save every image found.

        Page 0 is fetched first; the ``search_id`` needed for the pagination
        request is read from the ``href`` of a link on that page. The crawl
        stops early, with a message, when a page cannot be downloaded or
        when no ``search_id`` can be found.
        """
        search_id = ''
        for page_num in range(2):
            html = get_page(page_num, search_id)
            if not html:
                print('ERROR3: no HTML for page', page_num)
                break
            if page_num == 0:
                href = pq(html)(
                    '.result-content:last-child a:first-child').attr('href')
                # [^&]* stops at the next query parameter; the previous
                # greedy (.*) also captured everything after search_id.
                match = re.search(r'search_id=([^&]*)', href or '')
                search_id = match.group(1) if match else ''
            for img in parse_pg(html):
                print(img)
                save_img(img)
            if not search_id:
                # The pagination request cannot be built without it.
                print('ERROR3: search_id not found, stopping after page',
                      page_num)
                break
    
    # Start the crawl only when this file is run as a script, not on import.
    if __name__ == '__main__':
        main()
    
    
  • 相关阅读:
    深入浅出HTTP协议
    HTTP协议 URL
    HTTP Request
    HTTP 响应
    浅谈OpenStack架构
    Google云平台技术架构
    接口文档神器之apidoc
    .htaccess 配置生效
    php namespace与use
    mysql 带条件的sum/count 使用技巧
  • 原文地址:https://www.cnblogs.com/tingshu/p/14773354.html
Copyright © 2020-2023  润新知