• 正规函数编写、调用示例——猫眼抓取


    import requests,re,json,time
    from requests.exceptions import RequestException
    # Request headers handed to requests.get in get_one_page; only the
    # User-Agent string is set.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    def get_one_page(url):
        """Fetch *url* and return the response body as text.

        Args:
            url: Address of the page to download.

        Returns:
            The decoded response text when the server answers with HTTP 200,
            otherwise ``None`` (non-200 status, connection error, timeout, ...).
        """
        try:
            # Without a timeout requests waits indefinitely on a stalled
            # connection, which would freeze the whole crawl.
            r = requests.get(url, headers=headers, timeout=10)
        except RequestException:
            # Report network failures the same way as a bad status code so
            # callers only have one "no page" case to handle.
            return None
        if r.status_code == 200:
            return r.text
        return None
    
    def parse_one_page(html):
        """Yield one dict per ``<dd>`` film entry found in *html*.

        Args:
            html: Markup of a board page, or ``None``/empty when the download
                failed; in that case nothing is yielded.

        Yields:
            Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
            ``time`` and ``score``, all holding strings.
        """
        if not html:
            # get_one_page returns None on failure; re.findall would raise
            # TypeError on it.
            return

        # Raw strings keep the backslash in \d intact. re.S lets '.' span
        # newlines because each <dd> entry covers several lines of markup.
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
            re.S,
        )

        for item in pattern.findall(html):
            yield {
                'index': item[0],
                'image': item[1],
                'title': item[2].strip(),
                # Drop the first 3 / 5 characters (the field's leading label);
                # slicing a shorter string simply yields ''.
                'actor': item[3].strip()[3:],
                'time': item[4].strip()[5:],
                # Integer and fractional parts of the score live in separate tags.
                'score': item[5].strip() + item[6].strip(),
            }
    def write_to_file(content):
        """Append *content* to ``result.txt`` as a single line of JSON.

        Args:
            content: A JSON-serialisable object (here, one parsed film dict).
        """
        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # encoding is pinned to UTF-8 instead of the platform default.
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    def main(offset):
        """Download one board page, print each parsed film and save it.

        Args:
            offset: Value for the ``offset`` query parameter of the board URL.
        """
        url = 'https://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page signals a failed download with None; skip this
            # page rather than handing None to the regex parser.
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)
        
    if __name__ == '__main__':
        # Offsets 0, 10, ..., 90 — ten pages in total, with a one-second
        # pause after each page.
        for offset in range(0, 100, 10):
            main(offset=offset)
            time.sleep(1)
  • 相关阅读:
    tablespaces
    转 房价
    Duplicate a whole line in Vim
    Dubbo+JStorm
    replace all
    ORACLE 最后表数据更新的时间
    list reverse
    python IDE
    string 方法
    java JIT AOT
  • 原文地址:https://www.cnblogs.com/chenxi188/p/10523986.html
Copyright © 2020-2023  润新知