• Fetching meeting report information with Python (a work in progress)


    This is a simple crawler for fetching medical physicist meeting reports; the database table structure is still being refined.

    The crawler code:

      # -*- coding:utf-8 -*-
      import urllib.request
      from urllib.parse import urljoin
      import pymysql
      from bs4 import BeautifulSoup
      import time
      import re
      import os

      # Base class for database connections
      class Conn_Mssql:
          # Query the MySQL database
          @staticmethod
          def Select_mssql(strsql):
              # Database connection settings
              conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                                     password="password", database="internetdaq",
                                     charset="utf8")
              cur = conn.cursor()
              cur.execute(strsql)
              return cur

          # Insert into or update the database
          @staticmethod
          def InsertOrUpdate_mssql(strsql, params=None):
              # Database connection settings
              conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                                     password="password", database="internetdaq",
                                     charset="utf8")
              cur = conn.cursor()
              # Parameterized execution keeps quotes in titles from breaking the SQL
              cur.execute(strsql, params)
              conn.commit()
              conn.close()
              return cur

      # Fetch information from the web and store it
      class Get_HttpMessage:
          # Download a file
          @staticmethod
          def getFile(url):
              try:
                  file_name = url.split('/')[-1]
                  os.makedirs("StorePDF", exist_ok=True)
                  file_path = os.path.join("StorePDF", file_name)
                  u = urllib.request.urlopen(url)
              except Exception:
                  print(url, "url file not found")
                  return
              block_sz = 90192
              with open(file_path, 'wb') as f:
                  while True:
                      buffer = u.read(block_sz)
                      if buffer:
                          f.write(buffer)
                      else:
                          break
              print("Successfully downloaded " + file_name)

          # Start fetching web information
          @staticmethod
          def startGet():
              print('start')
              # The AAPM reports page
              url = "https://www.aapm.org/pubs/reports/"
              request = urllib.request.Request(url)
              response = urllib.request.urlopen(request)
              data = response.read()
              soup = BeautifulSoup(data, "lxml")
              # Links whose href attribute contains "docid"
              for link in soup.find_all(href=re.compile("docid")):
                  # Link target
                  text_url = link['href']
                  # Link text
                  text_Name = link.get_text()
                  if len(text_url) > 0 and len(text_Name) > 10:
                      strSQl = ("insert into daqtest (SAVE_TIME,URL_Name,URL_Link) "
                                "values (NOW(),%s,%s)")
                      try:
                          # Store the link information
                          Conn_Mssql.InsertOrUpdate_mssql(strSQl, (text_Name, urljoin(url, text_url)))
                      except Exception:
                          print('Failed to store the parent page in MySQL')
                      time.sleep(1)
                      # Page that contains the report PDFs
                      # (urljoin handles both relative and absolute hrefs)
                      urlSecond = urljoin(url, text_url)
                      request2 = urllib.request.Request(urlSecond)
                      response2 = urllib.request.urlopen(request2)
                      data2 = response2.read()
                      soup2 = BeautifulSoup(data2, "lxml")
                      # Used to skip duplicate PDF links
                      pdfName = ""
                      # Find the PDF links on the page
                      for link2 in soup2.find_all(href=re.compile("pdf")):
                          # PDF link, resolved in case it is relative
                          text_url2 = urljoin(urlSecond, link2['href'])
                          # Page the PDF was found on
                          text_Name2 = urljoin(url, text_url)
                          if len(text_url2) > 0 and pdfName != text_url2:
                              pdfName = text_url2
                              strSQl2 = ("insert into daqtest (SAVE_TIME,URL_Name,URL_Link) "
                                         "values (NOW(),%s,%s)")
                              try:
                                  # Store the PDF link in the database
                                  Conn_Mssql.InsertOrUpdate_mssql(strSQl2, (text_Name2, text_url2))
                                  # Slow down a little to go easy on the site
                                  time.sleep(1)
                                  # Download the PDF itself
                                  Get_HttpMessage.getFile(text_url2)
                              except Exception:
                                  print('Failed to store the child page in MySQL')

      # Program entry point
      Get_HttpMessage.startGet()
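
    As an aside, the download loop in getFile could also be written with the third-party requests library in streaming mode, which avoids holding an entire PDF in memory. This is only a sketch of an alternative, not part of the original script; the function name, folder name, chunk size, and timeout are arbitrary choices:

      import os
      import requests

      # Sketch: a streaming replacement for Get_HttpMessage.getFile,
      # assuming requests is installed (pip install requests).
      def get_file_streaming(url, folder="StorePDF"):
          file_name = url.split('/')[-1]
          os.makedirs(folder, exist_ok=True)
          file_path = os.path.join(folder, file_name)
          try:
              # stream=True downloads the body lazily, chunk by chunk
              with requests.get(url, stream=True, timeout=30) as resp:
                  resp.raise_for_status()
                  with open(file_path, 'wb') as f:
                      for chunk in resp.iter_content(chunk_size=65536):
                          f.write(chunk)
          except requests.RequestException:
              print(url, "download failed")
              return
          print("Successfully downloaded " + file_name)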

    This is the database table structure used for storage:

      /*
      Navicat MySQL Data Transfer

      Source Server         : dde
      Source Server Version : 50624
      Source Host           : DESKTOP-V9MQNL6:3306
      Source Database       : internetdaq

      Target Server Type    : MYSQL
      Target Server Version : 50624
      File Encoding         : 65001
      */

      SET FOREIGN_KEY_CHECKS=0;

      -- ----------------------------
      -- Table structure for daqtest
      -- ----------------------------
      DROP TABLE IF EXISTS `daqtest`;
      CREATE TABLE `daqtest` (
        `ID` bigint(20) NOT NULL AUTO_INCREMENT,
        `SAVE_TIME` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
        `URL_Name` varchar(600) COLLATE utf8_unicode_ci DEFAULT NULL,
        `URL_Link` varchar(6000) COLLATE utf8_unicode_ci DEFAULT NULL,
        PRIMARY KEY (`ID`)
      ) ENGINE=InnoDB AUTO_INCREMENT=4634 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
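
    To sanity-check what the crawler has stored, the table can be queried back from Python. A minimal sketch reusing the connection settings above (adjust host and password to your environment):

      import pymysql

      # Sketch: print the 20 most recently saved links from daqtest.
      conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                             password="password", database="internetdaq",
                             charset="utf8")
      try:
          with conn.cursor() as cur:
              cur.execute("SELECT SAVE_TIME, URL_Name, URL_Link "
                          "FROM daqtest ORDER BY SAVE_TIME DESC LIMIT 20")
              for save_time, name, link in cur.fetchall():
                  print(save_time, name, link)
      finally:
          conn.close()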
• Original post: https://www.cnblogs.com/Qt-Chao/p/8324823.html