• Python学习--猫眼电影TOP100榜单抓取


    import requests
    import re
    import json
    import time
    
    def get_one_page(url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the whole run.

        Returns:
            The response text when the server answers with HTTP 200;
            ``None`` for any other status code or when the request itself
            fails (timeout, connection error, ...).
        """
        # Browser-like User-Agent header sent with the request.
        headers = {'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML,like Geck) Chrome/52.0.2743.116 Safari/537.36'}
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Report transport failures the same way as a bad status code.
            return None
        if response.status_code == 200:
            return response.text
        return None
    
    def parse_one_page(html):
        """Extract the movie entries from the markup of one board page.

        Args:
            html: Page markup as a string. ``None`` or an empty string
                produces no entries instead of raising.

        Yields:
            One dict per ``<dd>...</dd>`` entry with the keys ``'排列序号'``
            (rank), ``'图片'`` (image URL), ``'电影名'`` (title), ``'演员'``
            (cast), ``'时间'`` (release time) and ``'成绩'`` (score). All
            values are strings.
        """
        if not html:
            return
        # Raw strings keep ``\d`` intact; re.S lets ``.`` span line breaks
        # so one entry can be matched across several lines of markup.
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
            re.S,
        )
        for rank, image, title, star, release, integer, fraction in pattern.findall(html):
            yield {
                '排列序号': rank,
                '图片': image,
                '电影名': title,
                # Drops the first 3 characters after trimming -- presumably
                # a label such as "主演："; confirm against the live markup.
                '演员': star.strip()[3:],
                # Drops the first 5 characters after trimming -- presumably
                # a label such as "上映时间："; confirm against the live markup.
                '时间': release.strip()[5:],
                # The score is split into integer and fraction elements.
                '成绩': integer + fraction,
            }
    
    def write_to_file(content, path='D://result.txt'):
        """Append ``content`` to a text file as one JSON document per line.

        Args:
            content: JSON-serialisable object to store.
            path: Destination file; defaults to ``D://result.txt``. The file
                is opened in append mode with UTF-8 encoding.
        """
        with open(path, 'a', encoding='utf-8') as f:
            # ensure_ascii=False writes non-ASCII text as-is instead of
            # \uXXXX escapes.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    def main(offset):
        """Fetch one board page, print its entries and append them to the file.

        Args:
            offset: Value placed in the ``offset`` query parameter of the
                board URL.
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None when the download fails; skip this
            # page rather than handing None to the parser.
            print('Failed to fetch ' + url + '; skipping.')
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)
    if __name__ == '__main__':
        # Walk the board in steps of ten: offsets 0, 10, ..., 90.
        for page_offset in range(0, 100, 10):
            main(offset=page_offset)
            # Pause one second before the next request.
            time.sleep(1)
  • 相关阅读:
    The requested resource (/) is not available解决办法
    字符问题
    Unknown column in 'field list'
    table 和 div 简单布局
    css简介
    div 与 table 的优点
    瞎搞
    html
    小计--关联 复制表结构
    ddl dml dcl
  • 原文地址:https://www.cnblogs.com/Mayfly-nymph/p/10726778.html
Copyright © 2020-2023  润新知