• 人民法院重大事件抓取


    时间:2017-8-3 23:30

    Url:http://www.court.gov.cn

    py3.4 + mysql + win7

    import urllib.request
    import re
    import pymysql
    from time import sleep
    # Scrape the paginated listing at www.court.gov.cn/fabu-gengduo-15.html and
    # store every entry's title, absolute URL and date string in the MySQL
    # table PeopleCourt.lawcase.

    BASE_URL = 'http://www.court.gov.cn'
    LIST_URL = BASE_URL + '/fabu-gengduo-15.html?page='
    header = {'User-Agent':'Mozilla/5.0'}
    TABLE_EXISTS_ERRNO = 1050  # MySQL server error code ER_TABLE_EXISTS_ERROR

    # The '?' of the query string must be escaped; unescaped it would make the
    # preceding 'l' optional and the pattern could never match the real link.
    LAST_PAGE_RE = re.compile(
        r'<li class="last"><a href="/fabu-gengduo-15\.html\?page=(\d+)">')
    # Compiled once here instead of once per listing page.
    ITEM_RE = re.compile(
        '<a title="(.*?)" target="_blank" href="(.*?)">.*?</a>.*?<i class="date">(.*?)</i>',
        re.S)


    def fetch_html(url):
        """Return the decoded body of *url*, requested with the script's headers."""
        req = urllib.request.Request(url, headers=header)
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req) as res:
            return res.read().decode()


    # One connection for the whole run. IF NOT EXISTS replaces the old
    # "try to create, swallow every error, reconnect" sequence, so genuine
    # connection failures are no longer hidden.
    con = pymysql.connect(host = '127.0.0.1',user = 'root',passwd='root')
    try:
        con.query('create database IF NOT EXISTS PeopleCourt')
        con.select_db('PeopleCourt')

        try:
            con.query('create TABLE lawcase(title char(100),url char(100),time char(50))')
        except pymysql.MySQLError as exc:
            # Only "table already exists" is expected; anything else is a real error.
            if exc.args and exc.args[0] == TABLE_EXISTS_ERRNO:
                print('Table existed')
            else:
                raise

        # The first listing page carries a "last" link that reveals the page count.
        first_page = fetch_html(LIST_URL + '1')
        last_match = LAST_PAGE_RE.search(first_page)
        # Without a "last" link, fall back to scraping the first page only
        # instead of failing with an IndexError.
        page_count = int(last_match.group(1)) if last_match else 1
        print('page:'+str(page_count))

        for page in range(1, page_count + 1):
            print('Grab page:'+str(page))
            # Page 1 was already downloaded above; reuse it.
            data = first_page if page == 1 else fetch_html(LIST_URL + str(page))
            with con.cursor() as cur:
                for raw_title, href, date in ITEM_RE.findall(data):
                    title = raw_title.replace('\n','')
                    # Parameterized insert: scraped text containing quotes can
                    # neither break nor inject into the SQL statement.
                    cur.execute(
                        'insert INTO lawcase(title,url,time) VALUES (%s,%s,%s)',
                        (title, BASE_URL + href, date))
            # PyMySQL does not autocommit by default; persist each page as it is done.
            con.commit()
            sleep(2)  # pause between page requests
        print('Ok')
    finally:
        con.close()

    数据库截图:

    天下飞羽,花落凡尘
  • 相关阅读:
    堆优化Dijkstra模版
    poj_1364King
    快速排序库函数qsort的使用
    CMD type命令
    开放地址法
    poj_3159Candies
    poj_1511Invitation Cards
    何谓数据结构
    div ul li添加文本自动自动
    java虚拟机使用内存
  • 原文地址:https://www.cnblogs.com/AngelYuFan/p/7282821.html
Copyright © 2020-2023  润新知