• 20170513爬取猫眼电影Top100


    import json
    import re
    import requests
    from bs4 import BeautifulSoup
    from requests import RequestException
    from multiprocessing import Pool
    def get_one_page(url):
        """Download one page and return its body as text.

        Sends a GET request to ``url`` with a ``baiduspider+`` User-Agent
        header and a 5-second timeout.

        Returns:
            The response text when the status code is 200; None for any
            other status code or when a RequestException is raised.
        """
        request_headers = {'User-Agent': 'baiduspider+'}
        try:
            reply = requests.get(url, headers=request_headers, timeout=5)
            return reply.text if reply.status_code == 200 else None
        except RequestException:
            return None
    def parse_one_page(html):
        """Yield one record per ``<dd>`` element found in the page markup.

        Args:
            html: Page markup as a string. None or an empty string yields
                nothing instead of raising inside BeautifulSoup.

        Yields:
            A dict with the keys 'rank', 'name', 'star', 'releasetime' and
            'grade'. 'grade' is the text of the '.integer' element
            concatenated with the text of the '.fraction' element.

        Raises:
            IndexError: If a ``<dd>`` element lacks one of the selected
                child elements.
        """
        if not html:
            # get_one_page returns None when the download fails; handing
            # None to BeautifulSoup would raise a TypeError.
            return
        for item in BeautifulSoup(html, 'lxml').find_all('dd'):
            rank = item.select('i')[0].text
            name = item.select('p > a')[0].text
            star = item.select('.star')[0].text.strip()
            releasetime = item.select('.releasetime')[0].text
            integer = item.select('.integer')[0].text
            fraction = item.select('.fraction')[0].text
            yield {
                'rank': rank,
                'name': name,
                'star': star,
                'releasetime': releasetime,
                'grade': integer + fraction,
            }
    def write_to_file(content):
        """Append ``content`` to ``result.txt`` as JSON text.

        Args:
            content: A JSON-serializable object (the caller passes a dict).

        The file is opened in append mode with UTF-8 encoding, and each
        record is followed by a single space.
        """
        # NOTE(review): records are separated by a space rather than a
        # newline -- confirm this is the intended output format.
        # The with-statement closes the file, so no explicit close() is needed.
        with open('result.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False writes non-ASCII characters as-is instead
            # of \uXXXX escapes.
            f.write(json.dumps(content, ensure_ascii=False) + ' ')
    def main(offset):
        """Fetch one board page, then print and save every parsed record.

        Args:
            offset: Value placed in the ``offset`` query parameter of the
                board URL.
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None when the request fails or the
            # status is not 200; skip this page rather than crash in parsing.
            print('failed to fetch', url)
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)

    if __name__ == "__main__":
        # Process offsets 0, 10, ..., 90 one after another.
        for page_offset in range(0, 100, 10):
            main(page_offset)
        # Alternative left disabled by the author: map main over the same
        # offsets with a multiprocessing Pool.
        # pool = Pool()
        # pool.map(main, [i*10 for i in range(10)])
  • 相关阅读:
    进程间通信小结
    菜鸡和菜猫进行了一场Py交易
    菜鸡开始接触一些基本的算法逆向了
    菜鸡学逆向学得头皮发麻,终于它拿到了一段源代码
    静态分析-Windows找密码
    逆向-完成地址随机化关闭
    QSortFilterProxyModel 的过滤 排序
    linux命令2
    linux 命令1
    error c2059 c3905 c2148 c2238
  • 原文地址:https://www.cnblogs.com/Jiang190/p/6849845.html
Copyright © 2020-2023  润新知