This is a simple crawler for fetching medical physicists' meeting reports (AAPM reports); the database table schema is still a work in progress.
Crawler code:
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse
import pymysql
from bs4 import BeautifulSoup
import time
import re
import os

# Base class for database access
class Conn_Mssql:
    # Query the MySQL database
    @staticmethod
    def Select_mssql(strsql):
        # Database connection settings
        conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                               password="password", database="internetdaq",
                               charset="utf8")
        cur = conn.cursor()
        cur.execute(strsql)
        return cur

    # Insert into or update the database
    @staticmethod
    def InsertOrUpdate_mssql(strsql):
        # Database connection settings
        conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                               password="password", database="internetdaq",
                               charset="utf8")
        cur = conn.cursor()
        cur.execute(strsql)
        conn.commit()
        conn.close()
        return cur

# Fetch the web pages, store link records, and download PDFs
class Get_HttpMessage:
    # Download a file
    @staticmethod
    def getFile(url):
        try:
            file_name = url.split('/')[-1]
            os.makedirs("StorePDF", exist_ok=True)
            file_path = os.path.join("StorePDF", file_name)
            u = urllib.request.urlopen(url)
        except Exception:
            print(url, "url file not found")
            return
        block_sz = 8192
        with open(file_path, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if buffer:
                    f.write(buffer)
                else:
                    break
        print("Successfully downloaded " + file_name)

    # Start crawling
    @staticmethod
    def startGet():
        print('start')
        # AAPM reports index page
        url = "https://www.aapm.org/pubs/reports/"
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        data = response.read()
        soup = BeautifulSoup(data, "lxml")
        # Links whose href attribute contains the string "docid"
        for link in soup.find_all(href=re.compile("docid")):
            # Link target
            text_url = link['href']
            # Link text
            text_Name = link.get_text()
            if len(text_url) > 0 and len(text_Name) > 10:
                strSQl = ("insert into daqtest (SAVE_TIME,URL_Name,URL_Link) "
                          "values (NOW(),'" + text_Name + "','" + url + text_url + "')")
                try:
                    # Store the report-page record
                    Conn_Mssql.InsertOrUpdate_mssql(strSQl)
                except Exception:
                    print('MySQL insert failed for parent page')
                time.sleep(1)
                # Page that holds the report's PDF links
                urlSecond = url + text_url
                request2 = urllib.request.Request(urlSecond)
                response2 = urllib.request.urlopen(request2)
                data2 = response2.read()
                soup2 = BeautifulSoup(data2, "lxml")
                # Tracks the previous href to skip consecutive duplicate PDF links
                pdfName = ""
                # Find the PDF links on the page
                for link2 in soup2.find_all(href=re.compile("pdf")):
                    # PDF link target
                    text_url2 = link2['href']
                    # Page the PDF came from
                    text_Name2 = url + text_url
                    if len(text_url2) > 0 and pdfName != text_url2:
                        pdfName = text_url2
                        strSQl2 = ("insert into daqtest (SAVE_TIME,URL_Name,URL_Link) "
                                   "values (NOW(),'" + text_Name2 + "','" + text_url2 + "')")
                        try:
                            # Store the PDF record
                            Conn_Mssql.InsertOrUpdate_mssql(strSQl2)
                            # Throttle requests to go easy on the site
                            time.sleep(1)
                            # Download the PDF; resolve relative hrefs against the page URL
                            Get_HttpMessage.getFile(urllib.parse.urljoin(urlSecond, text_url2))
                        except Exception:
                            print('MySQL insert failed for child page')

# Entry point
Get_HttpMessage.startGet()
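Report titles scraped from the page can contain single quotes, which would break the concatenated INSERT statements above and also leave the crawler open to SQL injection. A minimal sketch of the same insert using pymysql's parameterized queries; the host, credentials, and table come from the code above, and insert_link and the sample arguments are hypothetical names for illustration:

# -*- coding:utf-8 -*-
import pymysql

def insert_link(name, link):
    # Let the driver escape the values instead of splicing them into the SQL string
    conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                           password="password", database="internetdaq",
                           charset="utf8")
    try:
        with conn.cursor() as cur:
            cur.execute(
                "insert into daqtest (SAVE_TIME, URL_Name, URL_Link) "
                "values (NOW(), %s, %s)",
                (name, link))
        conn.commit()
    finally:
        conn.close()

# A title with a quote no longer breaks the statement (example values)
insert_link("O'Brien's report", "https://www.aapm.org/pubs/reports/")

This also removes the need for the manual .encode('utf8') calls, since the driver handles encoding when it binds the parameters.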
This is the database table schema used for storage:
/*
Navicat MySQL Data Transfer

Source Server         : dde
Source Server Version : 50624
Source Host           : DESKTOP-V9MQNL6:3306
Source Database       : internetdaq

Target Server Type    : MYSQL
Target Server Version : 50624
File Encoding         : 65001
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for daqtest
-- ----------------------------
DROP TABLE IF EXISTS `daqtest`;
CREATE TABLE `daqtest` (
  `ID` bigint(20) NOT NULL AUTO_INCREMENT,
  `SAVE_TIME` datetime DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP,
  `URL_Name` varchar(600) COLLATE utf8_unicode_ci DEFAULT NULL,
  `URL_Link` varchar(6000) COLLATE utf8_unicode_ci DEFAULT NULL,
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=4634 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
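Because URL_Link carries no UNIQUE constraint, every run of the crawler appends a fresh copy of each link (the AUTO_INCREMENT=4634 in the dump above suggests repeated runs have already accumulated rows). A small sketch for spotting duplicates with pymysql, assuming the same connection settings the crawler uses:

# -*- coding:utf-8 -*-
import pymysql

conn = pymysql.connect(host="DESKTOP-V9MQNL6", user="root",
                       password="password", database="internetdaq",
                       charset="utf8")
try:
    with conn.cursor() as cur:
        # Group stored links and report any that were saved more than once
        cur.execute(
            "select URL_Link, count(*) as cnt from daqtest "
            "group by URL_Link having cnt > 1 "
            "order by cnt desc")
        for url_link, cnt in cur.fetchall():
            print(cnt, url_link)
finally:
    conn.close()

As the schema evolves, a UNIQUE index on a hash of URL_Link (the column itself is too wide to index directly at varchar(6000)) would let the crawler skip duplicates at insert time instead of cleaning them up afterwards.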