• china-pub


    #!/usr/bin/env python
    #coding:utf-8
    # Crawler for china-pub.com new-book ranking pages (Python 2).
    import urllib2,re,sys,os,types
    #from bs4 import BeautifulSoup

    # Python 2 hack: reload() restores the removed setdefaultencoding so mixed
    # str/unicode operations implicitly encode as GBK.
    # NOTE(review): the source file itself declares utf-8 above — confirm the
    # mismatch is intentional (the crawled pages appear to be GBK).
    reload(sys);
    sys.setdefaultencoding('gbk');

    # Unused in this script; presumably leftovers from an earlier 58.com
    # crawler (see writeHeader below) — TODO confirm.
    province="上海"
    city="上海"
    # UTF-8 BOM written at the top of the CSV so spreadsheet apps detect the
    # encoding. Bug fixed: the original 'xEFxBBxBF' was missing the backslash
    # escapes and wrote the literal 9-character string instead of 3 bytes.
    fileHeader='\xEF\xBB\xBF'
    colums='省直辖市^城市^行政区^商圈^名称^地址^联系人^联系电话^URL^公司介绍^'

    def getCompany(method):
        for page in range(1,5+1):  
            url1="http://product.china-pub.com/cache/rank3/newbook/%s_%s.html"%(method,page)
            print " ##################:",url1
            httpCrawler(url1,page,method)

    def httpCrawler(url,page,method):
        content = httpRequest(url)
        #<tr logr='j_2_27359935228167_20019655228034_3'>
        List=re.findall(r'<td height="17" style="overflow: hidden;" colspan="5">(.*?)<a href="(.*?)" target="_blank">(.*?)</a>',content,re.S)
        no=len(List)
        print no
        method1=method.replace("/","")
        for i in range(0,no):#0 ~ no-1
        url=List[i][1]
        name=List[i][2]
        name1=name.replace("/","").replace(u"+微信营销与运营:策略、方法、技巧与实践+微信营销解密:移动互联网时代的营销革命","")
        print " download one page:",List[i][1]," ",List[i][2]
        if not os.path.exists('./%s'%method1):
            os.mkdir(r'./%s'%method1)
        content = httpRequest(url)
    #    if (page-1)*20+i+1 != 82:
            open(u'%s/%s.%s'%(method1,(page-1)*20+i+1,name1+'.html'),'w+').write(content)
        print "ok"

    def httpRequest(url):
        #try:
        html = None
        req_header = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
            'Accept':'text/html;q=0.9,*/*;q=0.8',
            #'Accept-Language':'en-US,en;q=0.5',
            #'Accept-Encoding':'gzip',
            #'Host':'j3.s2.dpfile.com',
            #'Connection':'keep-alive'
            #'Referer':'http://www.baidu.com'
        }
        req_timeout = 15
        req = urllib2.Request(url,None,req_header)
        resp = urllib2.urlopen(req,None,req_timeout)
        html = resp.read()#.decode('gbk').encode('gbk')
        print "resp:",resp
        #print html
        #finally:
        #    if resp:
        #        resp.close()
        return html

    def writeHeader(fileheader,colums):
        """Write the CSV header (BOM prefix + '^'-separated column names) to
        ./58/daikuan.csv, creating the ./58 directory first if needed.

        Improvement: `with` guarantees the file is closed even if a write
        raises, replacing the manual open/close pair.
        """
        if not os.path.exists('./58'):
            os.mkdir(r'./58')
        with open('./58/daikuan.csv', 'w') as f:
            f.write(fileheader)
            f.write(colums)

    # Ranking-list keys crawled by main: for each of 10 category codes
    # (51, 02, 31, 57, 47, 46, 60, 52, 59, 28) the 7/30/90-day lists plus the
    # monthly lists for months 7 down to 1 — 100 entries total.
    # Bug fixed: a missing comma after 'month/rank_month_1_59' silently
    # concatenated it with 'day/rank_day_7_28' into one bogus entry via
    # implicit string-literal concatenation (99 entries, two lists never
    # crawled correctly).
    array=(
    'day/rank_day_7_51',
    'day/rank_day_30_51',
    'day/rank_day_90_51',
    'month/rank_month_7_51',
    'month/rank_month_6_51',
    'month/rank_month_5_51',
    'month/rank_month_4_51',
    'month/rank_month_3_51',
    'month/rank_month_2_51',
    'month/rank_month_1_51',
    'day/rank_day_7_02',
    'day/rank_day_30_02',
    'day/rank_day_90_02',
    'month/rank_month_7_02',
    'month/rank_month_6_02',
    'month/rank_month_5_02',
    'month/rank_month_4_02',
    'month/rank_month_3_02',
    'month/rank_month_2_02',
    'month/rank_month_1_02',
    'day/rank_day_7_31',
    'day/rank_day_30_31',
    'day/rank_day_90_31',
    'month/rank_month_7_31',
    'month/rank_month_6_31',
    'month/rank_month_5_31',
    'month/rank_month_4_31',
    'month/rank_month_3_31',
    'month/rank_month_2_31',
    'month/rank_month_1_31',
    'day/rank_day_7_57',
    'day/rank_day_30_57',
    'day/rank_day_90_57',
    'month/rank_month_7_57',
    'month/rank_month_6_57',
    'month/rank_month_5_57',
    'month/rank_month_4_57',
    'month/rank_month_3_57',
    'month/rank_month_2_57',
    'month/rank_month_1_57',
    'day/rank_day_7_47',
    'day/rank_day_30_47',
    'day/rank_day_90_47',
    'month/rank_month_7_47',
    'month/rank_month_6_47',
    'month/rank_month_5_47',
    'month/rank_month_4_47',
    'month/rank_month_3_47',
    'month/rank_month_2_47',
    'month/rank_month_1_47',
    'day/rank_day_7_46',
    'day/rank_day_30_46',
    'day/rank_day_90_46',
    'month/rank_month_7_46',
    'month/rank_month_6_46',
    'month/rank_month_5_46',
    'month/rank_month_4_46',
    'month/rank_month_3_46',
    'month/rank_month_2_46',
    'month/rank_month_1_46',
    'day/rank_day_7_60',
    'day/rank_day_30_60',
    'day/rank_day_90_60',
    'month/rank_month_7_60',
    'month/rank_month_6_60',
    'month/rank_month_5_60',
    'month/rank_month_4_60',
    'month/rank_month_3_60',
    'month/rank_month_2_60',
    'month/rank_month_1_60',
    'day/rank_day_7_52',
    'day/rank_day_30_52',
    'day/rank_day_90_52',
    'month/rank_month_7_52',
    'month/rank_month_6_52',
    'month/rank_month_5_52',
    'month/rank_month_4_52',
    'month/rank_month_3_52',
    'month/rank_month_2_52',
    'month/rank_month_1_52',
    'day/rank_day_7_59',
    'day/rank_day_30_59',
    'day/rank_day_90_59',
    'month/rank_month_7_59',
    'month/rank_month_6_59',
    'month/rank_month_5_59',
    'month/rank_month_4_59',
    'month/rank_month_3_59',
    'month/rank_month_2_59',
    'month/rank_month_1_59',
    'day/rank_day_7_28',
    'day/rank_day_30_28',
    'day/rank_day_90_28',
    'month/rank_month_7_28',
    'month/rank_month_6_28',
    'month/rank_month_5_28',
    'month/rank_month_4_28',
    'month/rank_month_3_28',
    'month/rank_month_2_28',
    'month/rank_month_1_28',
    )



    if __name__ == '__main__':
        #writeHeader(fileHeader,colums)
        print len(array)
        for i in range(0,len(array)):
            getCompany(array[i])

  • 相关阅读:
    ApacheCN 所有教程/文档集已备份到 Coding
    固态硬盘寿命天梯榜 2021.7
    一个垃圾佬的自我修养(一)工作站篇
    Java 向上转型
    记一次chromedriver与浏览器版本不匹配导致的问题(mac版本)
    关于C# 里面的axWindowsMediaPlayer的使用
    WCHAR的相关操作 范例 , 同时也是产生创建Sqlserver语句新表的 Sql
    C++ Win32 socket 2.0版本 TCP 服务器
    C++ WIN 32 socket 2.0版本 TCP客户端
    数据库和传感器糅合 数据部分程序 正常运行
  • 原文地址:https://www.cnblogs.com/timssd/p/4714665.html
Copyright © 2020-2023  润新知