• python解析网页


    果断使用BeautifulSoup!

    过程很简单,就不多写了,直接贴代码(Python 2,依赖 chardet、BeautifulSoup 3 和 pymongo):

     1 import urllib2
     2 import chardet
     3 from BeautifulSoup import BeautifulSoup
     4 import pymongo
     5 import time
     # Handle to the "notice" database (user notices). Connection() is called
     # with no arguments, so the driver's default host and port apply.
     db = pymongo.Connection()['notice']
     7 
     def load_work():
         """Return the result of querying every document in the ``work`` collection.

         main_work() iterates over the result and reads each document's
         ``url`` field.
         """
         jobs = db.work.find()
         return jobs
    def work_insert(data):
        """Insert ``data`` into the ``work`` collection."""
        work_collection = db.work
        work_collection.insert(data)
    def update_state(idx, data):
        """Update the ``state`` collection, inserting when nothing matches.

        idx  -- query document selecting what to update
        data -- document (or update spec) to apply to the match
        """
        state_collection = db.state
        # upsert=True: create the document if the query matches none.
        state_collection.update(idx, data, upsert=True)
    def to_notice(item):
        """Print ``item``, or 'No news' when ``item`` is empty.

        ``item`` must support len(). The output goes to stdout only.
        """
        if len(item) != 0:
            print(item)
        else:
            print('No news')
    def main_work():
        """Run one polling round over every watched page.

        For each document returned by load_work(), download the page named by
        its 'url' field, collect the href of every <a> element that has one,
        and work out which of those links are not yet recorded for the page
        in db.state. When a page has new links they are passed to to_notice()
        and the page's state document is upserted with the full list of links
        currently on the page.
        """
        print('=====begin======')
        print(time.ctime())
        for page in load_work():
            page_url = page['url']

            # Close the HTTP response explicitly instead of leaving it to GC.
            response = urllib2.urlopen(page_url)
            try:
                data = response.read()
            finally:
                response.close()

            # chardet reports an encoding of None when it cannot guess; in
            # that case pass None through so BeautifulSoup does its own
            # detection, rather than crashing on None.lower().
            charset = chardet.detect(data)['encoding']
            if charset is not None:
                charset = charset.lower()
                if charset == 'gb2312':
                    # GBK is a superset of GB2312, so decode with GBK.
                    charset = 'GBK'
            soup = BeautifulSoup(data, fromEncoding=charset)

            # Load the links already recorded for this page once, instead of
            # issuing one find_one() query per anchor.
            stored = db.state.find_one({'url': page_url})
            if stored is None:
                seen = set()
            else:
                seen = set(stored.get('url_set', []))

            new_item = []
            url_set = []
            # href=True keeps only anchors that actually carry an href, so
            # attribute-less <a> tags no longer raise and the value read is
            # the link target rather than whichever attribute came first.
            for anchor in soup.findAll('a', href=True):
                url = anchor['href']
                url_set.append(url)
                if url not in seen:
                    # Remember it so a link repeated on the same page is
                    # reported only once.
                    seen.add(url)
                    new_item.append((anchor.getText(), url))

            if new_item:
                # TODO notice
                to_notice(new_item)
                db.state.update({'url': page_url},
                                {'url': page_url, 'url_set': url_set},
                                upsert=True)
        print('=====end======')
    # Poll forever: run one checking round, then wait 60 seconds before the
    # next one.
    while True:
        main_work()
        time.sleep(60)
    by 1957
  • 相关阅读:
    Opencv3.4:显示一张图片
    Windows编译Opencv
    FFmpeg4.0笔记:rtsp2rtmp
    FFmpeg4.0笔记:file2rtmp
    Ubuntu编译安装crtmp-server
    python笔记:#014#综合应用
    python笔记:#012#函数
    Python学习--利用scapy库实现ARP欺骗
    metasploit——(三)渗透攻击之旅
    metasploit——(一)情报收集篇
  • 原文地址:https://www.cnblogs.com/x1957/p/3078783.html
Copyright © 2020-2023  润新知