• python 编码问题


    __author__ = 'dell'
    # -*- coding: utf-8 -*-
    
    from lxml import etree
    import urllib2
    import time
    
    
    def loadCategory(path='catetory.txt'):
        """Load the category-name -> category-id mapping from a GBK text file.

        Every useful line holds at least two tab-separated fields: the id
        comes first and the name second.  Lines with fewer than two fields
        (including blank lines) are skipped.  Each name is printed as it is
        read.  When a name occurs more than once, the last id wins.

        :param path: file to read; the default is the file name the script
            has always used (spelling kept as-is, it is a path on disk).
        :return: dict mapping category name to category id, both decoded
            from GBK.
        """
        res = {}
        # Binary mode makes the explicit GBK decode below the only decoding
        # step, and the context manager closes the handle even on error.
        with open(path, 'rb') as f_txt:
            for raw in f_txt:
                tokens = raw.strip().decode('gbk').split(u'\t')
                if len(tokens) < 2:
                    continue
                key = tokens[1].strip()
                print(key)
                res[key] = tokens[0].strip()
        return res
    
    
    def loadCity(path='city.txt'):
        """Load the city-name -> city-id mapping from a GBK text file.

        Every useful line has the form ``name:id``.  Lines without a colon
        (including blank lines) are skipped, and fields after the second one
        are ignored.  When a name occurs more than once, the first id is kept
        and a ``repeated city`` notice is printed for each later occurrence.

        :param path: file to read; the default is the file name the script
            has always used.
        :return: dict mapping city name to city id, both decoded from GBK.
        """
        res = {}
        # Binary mode makes the explicit GBK decode below the only decoding
        # step, and the context manager closes the handle even on error.
        with open(path, 'rb') as f_txt:
            for raw in f_txt:
                tokens = raw.strip().decode('gbk').split(u':')
                if len(tokens) < 2:
                    continue
                key = tokens[0].strip()
                # Membership is tested on the dict itself: on Python 2,
                # ``res.keys()`` would build and scan a list for every line.
                if key in res:
                    print('repeated city: %s' % key)
                else:
                    res[key] = tokens[1].strip()
        return res
    
    
    # Load both lookup tables once at import time; gen() reads them as globals.
    cats = loadCategory()
    # Debug helper (disabled): dump the category table.
    # for key in cats.keys():
    #     print key, cats[key]
    
    citys = loadCity()
    # Debug helper (disabled): dump the city table.
    # for key in citys.keys():
    #     print key, citys[key]
    
    # Report how many entries each table ended up with.
    print 'length of category:', len(cats)
    print 'length of citys:', len(citys)
    
    print 'generating urls ... ...'
    
    # URL template; gen() fills it with (city id, category id), in that order.
    standard = 'http://www.dianping.com/search/category/%s/%s'
    
    
    def gen(cateName):
        """Build the search URL of one category for every known city.

        Reads the module-level ``cats`` and ``citys`` tables and the
        ``standard`` URL template.

        :param cateName: category name to look up in ``cats``.
        :return: list of ``(url, cityName)`` tuples, one per entry in
            ``citys``; an empty list when ``cateName`` is not a known
            category.
        """
        # Guard clause: test the dict directly rather than ``cats.keys()``,
        # which on Python 2 builds a list and scans it linearly.
        if cateName not in cats:
            return []
        catId = cats[cateName]
        return [(standard % (cityId, catId), cityName)
                for cityName, cityId in citys.items()]
    
    
    def getHtml(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        The request carries a desktop Firefox User-Agent header and uses a
        45-second timeout.

        :param url: address to fetch.
        :return: the page content as a unicode string.
        :raises urllib2.URLError: when the request fails.
        :raises UnicodeDecodeError: when the body is not valid UTF-8.
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
        response = urllib2.urlopen(request, timeout=45)
        try:
            return response.read().decode('utf8')
        finally:
            # Release the connection explicitly, even when read/decode
            # fails, instead of leaving it to garbage collection.
            response.close()
    
    
    def getFetchHour(count):
        """Return the estimated fetch time, in hours, for *count* items.

        Each item is costed at 5.0 units (presumably seconds per request --
        confirm), and the total is divided by 3600.
        """
        total_seconds = count * 5.0
        return total_seconds / 3600
    
    
    def getFetchDay(count):
        """Return the estimated fetch time, in days, for *count* items.

        Each item is costed at 5.0 units (presumably seconds per request --
        confirm); the total is divided by 3600 and then by 24.
        """
        total_seconds = count * 5.0
        hours = total_seconds / 3600
        return hours / 24
    
    
    urllist = gen(u'购物')
    print len(urllist)
    sum = 0
    for u in urllist:
        html = getHtml(u[0])
        tree = etree.HTML(html)
        hnc = tree.xpath("//span[@class='Color7']")
        for hn in hnc:
            strnum = hn.text.replace('(', '').replace(')', '')
            print u[1], strnum
            sum += int(strnum)
        # time.sleep(5)
    
    print sum
    print 'fetch time (hour) :' + str(getFetchHour(sum))
    print 'fetch time (day) :' + str(getFetchDay(sum))
  • 相关阅读:
    如何用SQL命令修改字段名称
    两个sql server 2000的通用分页存储过程
    Tomcat 6 连接 MS SQL 2005
    log4net 配置与应用
    如何去除Google搜索结果病毒提示
    Windows 2003远程桌面连接数限制
    ntext replace sql
    FCKeditor详细的设置
    SQL Server 自增字段归零等问题
    SQLServer2005数据库还原到SQLServer2000
  • 原文地址:https://www.cnblogs.com/i80386/p/3421468.html
Copyright © 2020-2023  润新知