• Python动态网站的抓取


    网页下载器

    # coding:utf-8
    import requests
    import urllib2
    import sys
    # Encoding name the interpreter reports for file-system paths.
    # NOTE(review): this rebinds the builtin name `type` at module level and is
    # not read anywhere in the visible code -- confirm before renaming/removing.
    type = sys.getfilesystemencoding()
    class HtmlDownloader(object):
        """Fetch page bodies over HTTP using a desktop-browser User-Agent."""

        def download(self, url):
            """Download ``url`` and return the raw response body.

            :param url: address to fetch; ``None`` is tolerated.
            :returns: the body returned by ``response.read()`` when the status
                code is 200, otherwise ``None`` (also ``None`` when ``url`` is
                ``None``).
            :raises: whatever ``urllib2.urlopen`` raises for network or HTTP
                errors; nothing is caught here.
            """
            if url is None:
                return None

            user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
            headers = {'User-Agent': user_agent}
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            # urllib2 responses are not context managers on Python 2, so the
            # socket is released explicitly once the body has been read.
            try:
                if response.getcode() == 200:
                    return response.read()
                return None
            finally:
                response.close()

    网页解析器

    # coding:utf-8
    import re
    import json
    class HtmlParser(object):
        """Extract movie links and rating records from Mtime responses.

        Rating records are 14-item tuples in the column order expected by the
        storage layer: ``(MovieId, MovieTitle, RatingFinal, ROtherFinal,
        RPictureFinal, RDirectorFinal, RStoryFinal, Usercount, AttitudeCount,
        TotalBoxOffice, TodayBoxOffice, Rank, ShowDays, isRelease)``.
        """

        def parser_url(self, page_url, response):
            """Return the distinct movie page links found in ``response``.

            :param page_url: address the page came from (unused, kept for the
                existing call signature).
            :param response: page text to scan; ``None``/empty is tolerated.
            :returns: list of ``(url, movie_id)`` string tuples in first-seen
                order, with duplicates removed; an empty list when nothing
                matches.
            """
            if not response:
                return []

            # Dots are escaped and the id group is ``\d+`` so only numeric
            # movie pages on movie.mtime.com are matched.
            pattern = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')
            seen = set()
            unique = []
            for link in pattern.findall(response):
                if link not in seen:
                    seen.add(link)
                    unique.append(link)
            return unique

        def parser_json(self, page_url, response):
            """Parse the asynchronous rating response into a record tuple.

            The payload is the JSON text between the first ``=`` and the next
            ``;`` of ``response``.

            :param page_url: request address, only used in diagnostics.
            :param response: text of the rating service response.
            :returns: a 14-item record tuple, or ``None`` when the response has
                no payload, the payload is not valid JSON, or it lacks the
                expected ``value`` object.
            """
            if not response:
                return None

            # Pull out the text between "=" and ";".
            match = re.search(r'=(.*?);', response)
            if match is None:
                return None

            try:
                value = json.loads(match.group(1))
                isRelease = value.get('value').get('isRelease')
            except (ValueError, AttributeError) as e:
                print(e)
                return None

            if not isRelease:
                return self._parser_no_release(page_url, value)
            if value.get('value').get('releaseType') is None:
                return self._parser_release(page_url, value)
            return self._parser_no_release(page_url, value, isRelease=2)

        @staticmethod
        def _rating_fields(payload):
            """Return the nine id/title/rating fields shared by every record."""
            rating = payload.get('movieRating')
            return (
                rating.get('MovieId'),
                payload.get('movieTitle'),
                rating.get('RatingFinal'),
                rating.get('ROtherFinal'),
                rating.get('RPictureFinal'),
                rating.get('RDirectorFinal'),
                rating.get('RStoryFinal'),
                rating.get('Usercount'),
                rating.get('AttitudeCount'),
            )

        def _parser_release(self, page_url, value):
            """Build the record for a movie that has box-office data.

            :param page_url: request address, only used in diagnostics.
            :param value: decoded JSON object of the rating response.
            :returns: record tuple with ``isRelease`` set to 1, or ``None``
                when a required object or box-office amount is missing.
            """
            try:
                payload = value.get('value')
                boxOffice = payload.get('boxOffice')
                total = boxOffice.get('TotalBoxOffice') + boxOffice.get('TotalBoxOfficeUnit')
                today = boxOffice.get('TodayBoxOffice') + boxOffice.get('TodayBoxOfficeUnit')
                # A missing or null rank is stored as 0.
                rank = boxOffice.get('Rank') or 0
                return self._rating_fields(payload) + (
                    total, today, rank, boxOffice.get('ShowDays'), 1)
            except (AttributeError, TypeError) as e:
                print('%s %s %s' % (e, page_url, value))
                return None

        def _parser_no_release(self, page_url, value, isRelease=0):
            """Build the record for a movie without box-office data.

            :param page_url: request address, only used in diagnostics.
            :param value: decoded JSON object of the rating response.
            :param isRelease: release flag stored in the record's last field.
            :returns: record tuple with placeholder box-office text and zero
                rank/show days, or ``None`` when a required object is missing.
            """
            try:
                fields = self._rating_fields(value.get('value'))
            except (AttributeError, TypeError) as e:
                print('%s %s %s' % (e, page_url, value))
                return None
            return fields + (u'无', u'无', 0, 0, isRelease)

     数据存储器

    # coding:utf-8
    import MySQLdb


    class DataOutput(object):
        """Buffer parsed movie records and write them to a MySQL table.

        Records are collected in ``self.datas`` and flushed in batches; call
        ``output_end`` once at the end to flush the remainder and release the
        cursor and connection.
        """

        # Column definitions used by ``create_table``. Adjacent literals inside
        # the parentheses are joined into a single string.
        # RANK is a reserved word in MySQL 8.0.2+, hence the back-quotes.
        _COLUMNS = (
            "id int(11) not null primary key auto_increment,"
            "MovieId int(11),"
            "MovieTitle varchar(40) NOT NULL,"
            "RatingFinal double NOT NULL DEFAULT 0.0,"
            "ROtherFinal double NOT NULL DEFAULT 0.0,"
            "RPictureFinal double NOT NULL DEFAULT 0.0,"
            "RDirectorFinal double NOT NULL DEFAULT 0.0,"
            "RStoryFinal double NOT NULL DEFAULT 0.0,"
            "Usercount int(11) NOT NULL DEFAULT 0,"
            "AttitudeCount int(11) NOT NULL DEFAULT 0,"
            "TotalBoxOffice varchar(20) NOT NULL,"
            "TodayBoxOffice varchar(20) NOT NULL,"
            "`Rank` int(11) NOT NULL DEFAULT 0,"
            "ShowDays int(11) NOT NULL DEFAULT 0,"
            "isRelease int(11) NOT NULL"
        )

        # Columns filled by ``output_db``, in record-tuple order.
        _INSERT_COLUMNS = (
            "MovieId,MovieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
            "RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,`Rank`,"
            "ShowDays,isRelease"
        )

        def __init__(self):
            """Connect to the local MySQL server and ensure the table exists."""
            self.con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='go', port=3306, charset='utf8')
            self.cx = self.con.cursor()
            self.create_table('MTime')
            self.datas = []

        def create_table(self, table_name):
            """Create ``table_name`` with the movie-record columns if absent.

            :param table_name: table identifier; interpolated into the SQL
                text as-is, so it must come from trusted code.
            """
            self.cx.execute(
                'CREATE TABLE IF NOT EXISTS %s(%s) ENGINE=InnoDB DEFAULT CHARSET=utf8'
                % (table_name, self._COLUMNS))

        def store_data(self, data):
            """Queue one record; flush once more than 10 records are pending.

            :param data: 14-item record tuple, or ``None`` (ignored).
            """
            if data is None:
                return
            self.datas.append(data)
            if len(self.datas) > 10:
                self.output_db('MTime')

        @classmethod
        def _insert_statement(cls, table_name):
            """Return the parameterised INSERT statement for ``table_name``."""
            placeholders = ",".join(["%s"] * 14)
            return ("INSERT INTO " + table_name + " (" + cls._INSERT_COLUMNS + ") "
                    "VALUES (" + placeholders + ")")

        def output_db(self, table_name):
            """Insert every pending record into ``table_name`` and commit.

            The connection stays open so later batches can be written. If an
            insert fails the transaction is rolled back, the pending records
            are kept, and the error is re-raised.

            :param table_name: table identifier; interpolated into the SQL
                text as-is, so it must come from trusted code.
            """
            if not self.datas:
                return
            statement = self._insert_statement(table_name)
            try:
                # Iterate without mutating the list; removing items during
                # iteration would skip every other record.
                for data in self.datas:
                    self.cx.execute(statement, data)
                self.con.commit()
            except Exception:
                self.con.rollback()
                raise
            self.datas = []

        def output_end(self):
            """Flush any pending records, then close the cursor and connection."""
            if len(self.datas) > 0:
                self.output_db('MTime')
            self.cx.close()
            self.con.close()

     爬虫调度器

    # coding:utf-8
    from UrlManager import UrlManager
    from DataOutput import DataOutput
    from HtmlDownloader import HtmlDownloader
    from HtmlParser import HtmlParser
    import time
    class SpiderMan(object):
        """Crawl scheduler: download a listing page, fetch each movie's rating
        data, and hand the parsed records to the data store."""

        def __init__(self):
            """Create the downloader, parser and data-output collaborators."""
            self.downloader = HtmlDownloader()
            self.parser = HtmlParser()
            self.output = DataOutput()

        @staticmethod
        def _rank_url(movie_url, movie_id, timestamp):
            """Return the rating-service URL for one movie.

            The fragments are wrapped in parentheses so they form a single
            string before ``%`` formatting is applied to all of it.
            """
            return (
                "http://service.library.mtime.com/Movie.api?"
                "Ajax_CallBack=true"
                "&Ajax_CallBackType=Mtime.Library.Services"
                "&Ajax_CallBackMethod=GetMovieOverviewRating"
                "&Ajax_CrossDomain=1"
                "&Ajax_RequestUrl=%s"
                "&t=%s"
                "&Ajax_CallBackArgument0=%s"
            ) % (movie_url, timestamp, movie_id)

        def crawl(self, root_url):
            """Crawl ``root_url`` and store a rating record for each movie link.

            A failure while processing one movie is printed and does not stop
            the remaining movies. Pending records are flushed at the end.

            :param root_url: listing page whose movie links are followed.
            """
            content = self.downloader.download(root_url)
            # A failed download yields no links instead of crashing the parser.
            urls = self.parser.parser_url(root_url, content) if content is not None else None

            for url in urls or []:
                try:
                    t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                    # url is a (movie_url, movie_id) pair.
                    rank_url = self._rank_url(url[0], url[1], t)
                    rank_content = self.downloader.download(rank_url)
                    data = self.parser.parser_json(rank_url, rank_content)
                    self.output.store_data(data)
                except Exception as e:
                    print(e)

            self.output.output_end()
            print("Crawl finish")

    if __name__ == '__main__':
        # Script entry point: crawl the Beijing theater listing page.
        start_url = 'http://theater.mtime.com/China_Beijing/'
        spider = SpiderMan()
        spider.crawl(start_url)

  • 相关阅读:
    Mysql 主备原理
    Mysql-innodb日志写入时机
    Innodb 架构
    Reactor/Proactor
    select,poll,epoll,IO多路复用进化史
    从硬件+操作系统的角度解释为什么操作系统的IO单位是磁盘块
    Dubbo 核心功能在业务架构中的体现
    Mysql-Innodb 锁总结
    第一阶段冲刺三
    第一阶段冲刺二
  • 原文地址:https://www.cnblogs.com/paulversion/p/8393842.html
Copyright © 2020-2023  润新知