• python(初学提取html页面元素,借用老师)


    # -*- coding: utf-8 -*-
    import os
    import re
    import urllib2
    
    def mean_audience_score(id, page_html=None):
        """Return the mean of the audience scores found on a movie page.

        Args:
            id: Movie identifier appended to ``http://movie.mtime.com/`` to
                build the page URL.  It is converted with ``str`` so numeric
                ids work too.  (The name shadows the builtin but is kept so
                existing callers are unaffected.)
            page_html: Optional markup that has already been downloaded.  When
                given, no network request is made and ``id`` is not used.

        Returns:
            The arithmetic mean (float) of every number found inside a
            ``<span class="db_point ml6">`` element, or ``0.0`` when the page
            contains no such element.
        """
        if page_html is None:
            sc_url = "http://movie.mtime.com/" + str(id) + "/"
            sc_req = urllib2.Request(sc_url, headers={'User-Agent': "Magic Browser"})
            sc_page = urllib2.urlopen(sc_req)
            try:
                page_html = sc_page.read()
            finally:
                # Release the connection even if reading fails.
                sc_page.close()

        # Digits must be matched with \d (a bare "d" matches the letter d), and
        # the capture group hands back only the number, so no second regex pass
        # over the surrounding markup is needed.
        scores = re.findall(
            r'<span class="db_point ml6">\s*(\d+(?:\.\d+)?)\s*</span>', page_html)
        if not scores:
            return 0.0
        return sum(float(score) for score in scores) / len(scores)
    
    THEATER_URL = 'http://theater.mtime.com/China_Anhui_Province_Wuhu/'
    LIST_MARKER = 'hotplaySvList = ['


    def parse_hot_films(page_text):
        """Extract an ``{id: title}`` dict from the page's ``hotplaySvList`` array.

        The text between ``hotplaySvList = [`` and the next ``;`` is split into
        records on ``},{``.  For each record the id is whatever follows the last
        ``:`` of the first comma-separated field, and the title is the last
        double-quoted string of the final field.
        NOTE(review): this assumes every record starts with its id field and
        ends with its quoted title, and that titles contain no ``,`` or ``;``
        -- confirm against the live page.

        Args:
            page_text: Full text of the theater page.

        Returns:
            dict mapping film id (str) to film title (str); empty when the
            array holds no records.

        Raises:
            ValueError: if the start marker or the terminating ``;`` is absent.
        """
        marker_pos = page_text.find(LIST_MARKER)
        if marker_pos == -1:
            raise ValueError('not find start tag')
        body_start = marker_pos + len(LIST_MARKER)
        # Search to the very end of the text so a ';' in last position counts.
        body_end = page_text.find(';', body_start)
        if body_end == -1:
            raise ValueError('not find end tag')

        films = {}
        for record in page_text[body_start:body_end].split("},{"):
            fields = record.split(',')
            quoted = fields[-1].split('"')
            if len(quoted) < 2:
                # No quoted title (e.g. an empty list "[]"): nothing to record.
                continue
            film_id = fields[0].split(':')[-1].strip()
            films[film_id] = quoted[-2].strip()
        return films


    def main():
        """Download the theater page and print each film id with its title."""
        request = urllib2.Request(THEATER_URL, headers={'User-Agent': "Magic Browser"})
        response = urllib2.urlopen(request)
        try:
            page_text = response.read()
        finally:
            response.close()

        try:
            films = parse_hot_films(page_text)
        except ValueError as err:
            print(str(err))
            # Non-zero status signals the failure; SystemExit (unlike os._exit)
            # lets buffered output be flushed before the interpreter stops.
            raise SystemExit(1)

        for film_id, title in films.items():
            print("id:" + film_id + "  film:" + title)


    if __name__ == '__main__':
        main()
    
  • 相关阅读:
    【转】JSch
    【转】JSch
    【转】class卸载、热替换和Tomcat的热部署的分析
    关于Tomcat自动加载更新class的小技巧
    MySQL中order by中关于NULL值的排序问题
    MySQL触发器使用详解
    QuartZ Cron表达式
    JDBC的URL设置allowMultiQueries的原因
    CRT:C运行库简介
    IntelliJ IDEA安装AngularJS插件
  • 原文地址:https://www.cnblogs.com/doublekai/p/6857778.html
Copyright © 2020-2023  润新知