# -*- coding: utf-8 -*-
"""Download the Henan FDA drug-retail-company directory.

Crawls the list pages of hda.gov.cn, follows each company's detail page
and appends "name,address,licence-number" records to thefile.txt.
"""
import urllib.request
import urllib.parse
import urllib.error
import re
import os
import http.cookiejar

# Browser-like headers; the site tends to reject obviously non-browser clients.
header = {
    'Connection': 'Keep-Alive',
    'Accept': 'application/x-ms-application, image/jpeg, application/xaml+xml, image/gif, image/pjpeg, application/x-ms-xbap, */*',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    #'Referer':'http://hda.gov.cn/interplugin/face2/base.jsp',
}


def getOpener():
    """Build, install and return a urllib opener that keeps cookies across requests."""
    print('正在设置cookie')
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro, urllib.request.HTTPHandler)
    # install_opener makes every later urllib.request.urlopen() carry the cookies.
    urllib.request.install_opener(opener)
    print('设置cookie成功')
    return opener


def getContent(url, pat, charSet):
    """Fetch *url*, decode the body with *charSet*, return all matches of regex *pat*.

    Returns a list (possibly empty) of the captured groups.
    Raises urllib.error.URLError on network failure and UnicodeDecodeError
    when the page is not valid *charSet* text.
    """
    page = urllib.request.urlopen(url)
    content = page.read().decode(charSet)
    return re.findall(re.compile(pat), content)


def download(content, pattern):
    """Extract company ids from a list page and append each company's record.

    *content* is the decoded HTML of a list page, *pattern* a regex whose
    first group captures the company id.  For every id the detail page is
    fetched and "name,address,licence" is appended to thefile.txt.
    Malformed or unreachable detail pages are skipped with a message
    instead of aborting the whole crawl.
    """
    matches = re.findall(re.compile(pattern), content)
    i = 0
    # One file handle for the whole batch instead of reopening per record.
    with open('thefile.txt', 'a', encoding='utf-8') as out:
        for i, match in enumerate(matches):
            subid = match[0]
            suburl = ('http://www.hda.gov.cn/interplugin/face2/content.jsp'
                      '?tableId=13&tableName=TABLE13'
                      '&tableView=%E8%8D%AF%E5%93%81%E9%9B%B6%E5%94%AE%E4%BC%81%E4%B8%9A'
                      '&Id=' + subid)
            try:
                # The value sits on the line after the label, hence the \n
                # inside the pattern (see the author's note at file end).
                qymc = getContent(suburl, r'企业名称.*\n.*83%>(.*)</td>', 'UTF-8')  # company name
                zcdz = getContent(suburl, r'注册地址.*\n.*83%>(.*)</td>', 'UTF-8')  # registered address
                xkzh = getContent(suburl, r'许可证号.*\n.*83%>(.*)</td>', 'UTF-8')  # licence number
                print(qymc, zcdz, xkzh)
                out.write(qymc[0])
                out.write(',')
                out.write(zcdz[0])
                out.write(',')
                out.write(xkzh[0])
                out.write('\n')  # one record per line
            except (IndexError, urllib.error.URLError, UnicodeDecodeError) as e:
                # Skip records whose detail page is missing/garbled; keep crawling.
                print('skipped id', subid, ':', e)
    print('i=', i)


if __name__ == '__main__':
    # Truncate the output file once; download() appends afterwards.
    open('thefile.txt', 'w', encoding='utf-8').close()
    # 1. First list page (served as GB2312; title parameter is GBK-escaped).
    url = ('http://hda.gov.cn/interplugin/face2/base.jsp?tableId=13&tableName=TABLE13'
           '&title=%D2%A9%C6%B7%C1%E3%CA%DB%C6%F3%D2%B5&bcId=137264323448453682513826398962')
    request = urllib.request.Request(url, headers=header)
    page = urllib.request.urlopen(request)
    pageContent = page.read().decode('gb2312')
    # First group = company id; literal ")" and "." escaped.
    pattern = r'&Id=(\d{1,4})",null\)>\d{1,6}\.(.*?)</a></p>'
    download(pageContent, pattern)
    # 2. List pages 2..1182 (served as UTF-8, slightly different markup).
    for k in range(2, 1183):
        url = ('http://www.hda.gov.cn/interplugin/face2/search.jsp?tableId=13'
               '&bcId=137264323448453682513826398962&curstart=' + str(k))
        print(url)
        request = urllib.request.Request(url, headers=header)
        page = urllib.request.urlopen(request)
        pageContent = page.read().decode('UTF-8')
        pattern = r"&Id=(\d{1,4})',null\)>\d{1,6}\.(.*?)</p>"
        download(pageContent, pattern)
    print('药品经营企业名称下载完成!')
经过几天的摸索,终于可以下到想要的数据了;
一路踩过的几个坑在此标下:
1、正则表达式中用 (.*) 匹配时,如果遇到换行,要在模式中加入'\n'
2、调试时充分 利用 fiddler 和 python SHELL(方便粘贴)工具,即时调试;
未解决的问题:爬取的第一个页面中有重复数据,暂未找到如何处理;