• requests, regex crawlers


    
    # def division(a, b):
    #     # write your code here
    #     try:
    #         c = int((a + b)/(a - b))
    #         return 'a={0},b={1},(a + b)/(a - b)={2}'.format(a, b, c)
    #     except ZeroDivisionError:
    #         return 'a = {0}, b = {1}, the denominator of (a+b)/(a-b) cannot be zero'.format(a, b)

    # a = division(5, 5)
    # print(a)
    
    
    """
    第四题答案
    判断闰年
    year = input('')
    year = int(year)
    if year % 4 == 0 and year % 100 != 0:
        print('is a leap year')
    elif year % 400 == 0:
        print('is a leap year')
    else:
        print('not a leap year')
    
    """
    
    # Answer to question 2
    # argv = [1,2,3]
    
    # import sys
    
    # name = str(argv[1])
    # SMS_verification_code = str(argv[2])
    
    # # print "Hello, name! Your validation code is SMS_verification_code
    # # please keep it in secret."
    # print("Hello, {0}! Your validation code is {1}, please keep it in secret".format(name,SMS_verification_code))
    
    # If a piece of code cannot meet the functional requirement yet,
    # mark it so it can be implemented better later on.
    
    # try:
    #     a = float(input('Enter the dividend: '))
    #     b = float(input('Enter the divisor: '))
    #     c = a/b
    #     print('The quotient is:', c)
    # except ZeroDivisionError:
    #     print('The divisor cannot be 0!')
    
    
    # What is a crawler, and what does a web page actually contain?
    # When you learn something, do you think about how to implement it, or do you just memorize it by rote?
    
    
    # First crawler attempt: fetch the Sogou homepage and save it to a file
    # import requests
    # if __name__ == '__main__':
    #     # step 1: specify the url
    #     url = 'https://www.sogou.com/'
    #     # step 2: send the request
    #     # the get method returns a response object
    #     response = requests.get(url = url)
    #     # step 3: get the response data; .text returns the response data as a string
    #     page_text = response.text # returned as a string
    #     print(page_text)
    #     # step 4: persist to disk
    #     with open('./sogou.html','w',encoding = 'utf-8') as fp: # apparently even a successfully fetched page may not render properly when opened locally
    #         fp.write(page_text)
    #     print('Finished crawling!')
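
    # Hedged sketch (an addition, reusing the same Sogou URL as above): checking the HTTP
    # status code and the response encoding before writing the file can help explain why a
    # saved page sometimes fails to render locally.
    import requests

    response = requests.get('https://www.sogou.com/')
    print(response.status_code)      # 200 means the request itself succeeded
    print(response.encoding)         # encoding guessed from the response headers
    response.encoding = response.apparent_encoding  # fall back to the encoding sniffed from the body
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(response.text)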
    
    
    # # UA spoofing: make the crawler's request identity look like a particular browser so it gets past User-Agent checks
    # import requests
    # if __name__ == '__main__':
    #     # UA spoofing: wrap the corresponding User-Agent string in a dict
    #     headers = {
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #     }
    #     # step 1: specify the url and query
    #     url = 'https://www.sogou.com/web'
    #     # wrap the parameters carried by the url in a dict
    #     kw = input('Enter a word:')
    #     param ={
    #         'query':kw # 'query' is the parameter name and kw is the typed-in value; the parameter name can be found in the page's request details
    #     }
    #     # step 2: send the request to the specified url; the url carries parameters, and they are handled as part of the request
    #     response = requests.get(url = url,params = param,headers = headers)
    #     #step3
    #     page_text = response.text
    #     #step4
    #     fileName = kw + '.html'
    #     with open(fileName,'w',encoding ='utf-8') as fp:
    #         fp.write(page_text)
    #     print(fileName,'saved successfully!!')
    
    
    
    # - POST request (carrying parameters)
    # The response data is a set of JSON data
    # import requests
    # import json
    # if __name__ == '__main__':
    #     # step 1: specify the URL
    #     post_url = 'https://fanyi.baidu.com/sug' # how do people figure out this final path suffix?
        
    #     # step 2: UA spoofing
    #     headers = {
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #     }
        
    #     # step 3: handle the POST request parameters (similar to a GET request)
    #     word = input('Enter a word:')
    #     data = {
    #         'kw':word
    #     }
        
    #     # step 4: send the request with url, data, headers; a GET request uses url, params, headers
    #     response = requests.post(url = post_url,data = data,headers = headers)
        
    #     # step 5: get the response data: json() returns an object (only call json() directly once you have confirmed the response is JSON -- check the Content-Type header)
    #     dict_obj = response.json()
    #     print(dict_obj)
        
    #     # step 6: persist to disk
    #     fileName = word + '.json'
    #     fp = open(fileName,'w',encoding='utf-8')
    #     json.dump(dict_obj,fp = fp) # json.dump() serializes a Python object to JSON and writes it to a file
    #     # json.dump() takes the object first and the open file object (fp) second
    #     fp.close()
    #     print('Over!')
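
    # Hedged sketch (added, not in the original notes): as the step-5 comment says, json()
    # is only safe when the response really is JSON, which the Content-Type header reveals.
    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.post('https://fanyi.baidu.com/sug', data={'kw': 'dog'}, headers=headers)
    content_type = resp.headers.get('Content-Type', '')
    if 'application/json' in content_type:
        print(resp.json())       # safe to parse as JSON
    else:
        print('Unexpected Content-Type:', content_type)
        print(resp.text[:200])   # inspect the raw body instead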
    
    # Imitate the code above and write a POST request from memory
    # quick recall: the data parameter, json.dump
    
    # import json
    # import requests
    
    
    # url = 'https://fanyi.baidu.com/sug'
    # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
    # kw = input('Enter the text you want to translate: ')
    # data = {
    #     'kw':kw
    # }
    
    # response = requests.post(headers=headers,data=data,url=url)
    # re_json = response.json()
    
    # fp = open(kw+'.json','w',encoding="utf-8")
    # json.dump(re_json,fp=fp) # indent formats the output, e.g. indent=True
    # # json.dump escapes non-ASCII characters by default; to write real Chinese characters, pass ensure_ascii=False
    # fp.close()
    
    
    
    # Crawl Douban movies
    # import requests
    # import json
    # if __name__ == '__main__':
    #     url = 'https://movie.douban.com/j/chart/top_list' # the chart page itself comes back empty; how do you find these requests embedded in the site? Requesting the ajax link directly returns the data
    #     param = {
    #         'type':'24',
    #         'interval_id':'100:90', # this request parameter is interesting
    #         'action':'',
    #         'start':'0',    # which movie in the library to start from
    #         'limit':'20'    # how many to fetch at a time
    #     }
    #     headers = {
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #     }
    #     # print(url+param) # TypeError: must be str, not dict -- only a str can be concatenated with a str
    #     response = requests.get(url = url,params = param,headers = headers)
    #     list_data = response.json()
    #     fp = open('./douban.json','w',encoding = 'utf-8')
    #     json.dump(list_data,fp = fp,ensure_ascii = False,indent=True)
    #     print('Over!')
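
    # Hedged sketch (an addition): the comment above notes that a str and a dict cannot be
    # concatenated -- requests encodes the params dict into the query string itself, and
    # response.url shows the final URL that was actually requested.
    import requests

    param = {'type': '24', 'interval_id': '100:90', 'action': '', 'start': '0', 'limit': '20'}
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get('https://movie.douban.com/j/chart/top_list', params=param, headers=headers)
    print(response.url)  # e.g. https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&...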
    
    
    # Return the KFC restaurants in a given city
    # import requests
    # import json
    
    # if __name__ == '__main__':
    #     post_url = 'https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword' # again a very specific url -- how was it found?
    #     keyword = input('Enter the city to query: ') # it does not even return the KFC stores in Jingdezhen
    
    #     data ={
    #         'cname': '',
    #         'pid': '',
    #         'keyword': keyword,
    #         'pageindex': '1',
    #         'pageSize': '10'
    #     }
    #     headers = {
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #     }
    #     response = requests.post(url = post_url, data = data, headers = headers)
    
    #     # persist to disk
    #     # page_text = response.text
    #     # fileName = keyword + '.html'
    #     # with open(fileName, 'w', encoding= 'utf-8') as fp:
    #     #     fp.write(page_text)
    #     # print(fileName, 'Over!')
    
    #     # print the results directly
    #     page = response.json()
    #     for store in page['Table1']: # what is this Table1 key for?
    #         StoreName = store['storeName']
    #         address = store['addressDetail']
    #         print('StoreName:' + StoreName, 'address:' + address + '\n')
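
    # Hedged sketch (added): to answer the question above about what 'Table1' is, printing
    # the top-level keys (or a pretty-printed slice) of the returned JSON shows how the
    # store list is nested. The example keyword is just a sample input.
    import json
    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    data = {'cname': '', 'pid': '', 'keyword': '北京', 'pageindex': '1', 'pageSize': '10'}
    page = requests.post('https://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword',
                         data=data, headers=headers).json()
    print(list(page.keys()))                                     # e.g. ['Table', 'Table1']
    print(json.dumps(page, ensure_ascii=False, indent=2)[:500])  # peek at the structure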
    
    
    # The domain plus an id value are concatenated into the full url of a company's detail page
    # the id values can be taken from the JSON returned by the homepage's ajax request
    # - the url domain is the same for every company; only the id parameter it carries differs
    # the company information on the homepage is loaded dynamically via ajax
    # import requests
    # import json
    
    # if __name__ == '__main__':
    #     headers = {
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #     }
    #     id_list = []  # stores the company ids
    #     all_data_list = []  # stores all the companies' detail data
    #     # fetch the id values of the different companies in bulk
    #     url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList' # again, how was this url found?
    #     # wrap the parameters
    #     for page in range(1, 11):
    #         page = str(page)
    #         data = {
    #             'on': 'true',
    #             'page': page,
    #             'pageSize': '15',
    #             'productName': '',
    #             'conditionType': '1',
    #             'applyname': '',
    #             'applysn': '',
    #         }
    #         # the POST must sit inside the page loop, otherwise only the last page's data dict is ever used
    #         json_ids = requests.post(url=url, headers=headers, data=data).json()
    #         # take the value stored under the 'list' key of the json_ids dict and iterate over it
    #         for dic in json_ids['list']:
    #             id_list.append(dic['ID'])
    #     # print(id_list, '\n')
    
    #     # fetch each company's detail data, which is also loaded dynamically; the request carries one parameter, id, taken from the id list built in the previous step
    #     post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    #     for id in id_list:
    #         data = {
    #         'id': id
    #         }
    
    #         json_detail = requests.post(url=post_url, data=data, headers=headers).json()
    #         #print(json_detail, '-------------END----------')
    #         all_data_list.append(json_detail )
    #         all_data_list.append('---------------------------------------------------------')
    
    
    #     # persist all_data_list to disk
    #     fp = open('./allData.json', 'w', encoding='utf-8')
    #     json.dump(all_data_list, fp=fp, ensure_ascii=False, indent= True)  # indent formats the output
    #     print('Over!')
    
    
    
    # Crawling pages with regular expressions


    # Requirement: crawl all of the images under the funny-pictures (qiutu) section of Qiushibaike
    # '''<div class="thumb">
    # <a href="/article/124098472" target="_blank">
    # <img src="//pic.qiushibaike.com/system/pictures/12409/124098472/medium/HSN2WWN0TP1VUPNG.jpg" alt="糗事#124098472" class="illustration" width="100%" height="auto">
    # </a>
    # </div>'''
    # import re
    # import os
    # import requests
    
    # if __name__ == '__main__':
    #     # create a folder to hold all the images
    #     if not os.path.exists('./qiutuLibs'):
    #         os.mkdir('./qiutuLibs')
    
    #     url = 'https://www.qiushibaike.com/imgrank/'  # an earlier attempt had a stray trailing space inside the quotes
    #     headers = {
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #     }
    #     # use a generic crawl to fetch the entire page at the url
    #     page_text = requests.get(url=url, headers=headers).text
    #     #print(page_text)
    
    #     # use a focused crawl to parse out all of the image links from the page
    #     ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>' # two attribute patterns appear inside the div: src="(.*?)" and alt=".*?"
    #     # only the content inside the capturing group () is returned; findall really does yield just the string captured by the parentheses
    #     img_src_list = re.findall(ex, page_text, re.S) # scan the string and return every matching substring as a list
    #     # re.S makes . match every character, including newlines
    #     print(img_src_list) # findall only returns strings matching the pattern -- an empty list usually means the url or the regex is off
    #     for src in img_src_list:
    #         # assemble the full image url
    #         src = 'https:' + src
    #         print('a single image url:', src)
    #         img_data = requests.get(url = src, headers = headers).content
    #         # generate the image file name
    #         img_name = src.split('/')[-1] # split divides the string on '/' and the [-1] index takes the last part as the file name
    #         imgPath = './qiutuLibs/' + img_name
    #         with open(imgPath, 'wb') as fp:
    #             fp.write(img_data)
    #         print(img_name, 'downloaded successfully!')
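
    # Hedged sketch (added): a tiny self-contained demo of the point made in the comments
    # above -- re.findall returns only what the capturing group (...) matched, and re.S
    # lets . cross line breaks. The sample html and domain are made up for illustration.
    import re

    html = '''<div class="thumb">
    <a href="/article/1" target="_blank">
    <img src="//pic.example.com/1.jpg" alt="pic #1" class="illustration">
    </a>
    </div>'''
    ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
    print(re.findall(ex, html, re.S))  # ['//pic.example.com/1.jpg'] -- only the group content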
    
    # import os
    # import re
    # import requests
    # if not os.path.exists('./qiutuLibs'):
    #     os.mkdir('./qiutuLibs')
    
    # url = 'https://www.qiushibaike.com/imgrank/' # 'https://www.qiushibaike.com/imgrank/ '
    
    # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
    
    # # extract the image urls
    # '''<div class="thumb">
    # <a href="/article/124098472" target="_blank">
    # <img src="//pic.qiushibaike.com/system/pictures/12409/124098472/medium/HSN2WWN0TP1VUPNG.jpg" alt="糗事#124098472" class="illustration" width="100%" height="auto">
    # </a>
    # </div>'''
    # response = requests.get(url = url, headers = headers).text
    # print(response)
    # img_re = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'# img不要写闭包>  # 正则匹配外面用小括号,里面的匹配项class = ""还有需匹配提取的
    # # 一定要用大括号 '<div class="thumb">.*?<img src= "(.*?)" alt=.?*></div>' 
    # # '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
    # # 这个正则写好容易错,该不该加''
    # img_list = re.findall(img_re, response, re.S)  # img_list = re.findall(img_re, response, re.S) 这句话匹配为空
    # print(img_list)
    # # re.S用于匹配换行
    # print('这下面都不运行了')
    # for i in img_list:
    #     url_img = 'https:' + i # 这里没有加https:没加:
    #     img_content = requests.get(url=url_img,headers=headers).content
    #     img_name = i.split('/')[-1] # split divides the string on '/'
    #     img_path = './qiutuLibs/' + img_name # './' refers to the current working directory
    #     with open(img_path,'wb') as fp: # passing encoding='utf-8' in text mode raised TypeError: write() argument must be str, not bytes
    #         # the image bytes must be written in binary mode
    #         fp.write(img_content)
    #     print('downloaded successfully')
    
    
    
    
    # Extend the code above so it can crawl the images page by page
    # import re
    # import os
    # import requests
    
    # if __name__ == '__main__':
    #     # create a folder to hold all the images
    #     if not os.path.exists('./qiutuLibs'):
    #         os.mkdir('./qiutuLibs')
    #     # set up a generic url template
    #     url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    #     for pageNum in range(1, 11):
    #         # the url for this page number
    #         new_url = url % pageNum  # the % substitution already yields a str, so wrapping it in format() is unnecessary
    #         headers = {
    #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    #         }
    #         # use a generic crawl to fetch the entire page for this url
    #         page_text = requests.get(url=new_url, headers=headers).text # page_text holds the whole document of one page -- what is its type?
    #         print(type(page_text)) # <class 'str'>

    #         # use a focused crawl to parse out all of the image links from the page
    #         ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
    
    #         img_src_list = re.findall(ex, page_text, re.S)
    #         print(img_src_list)
    #         for src in img_src_list:
    #             # assemble the full image url
    #             src = 'https:' + src
    #             img_data = requests.get(url = src, headers = headers).content
    #             # generate the image file name
    #             img_name = src.split('/')[-1]
    #             imgPath = './qiutuLibs/' + img_name
    #             with open(imgPath, 'wb') as fp:
    #                 fp.write(img_data)
    #             print(img_name, 'downloaded successfully!')
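
    # Hedged sketch (an addition, not from the original notes): when looping over many
    # pages like this, reusing one Session and pausing briefly between requests keeps the
    # crawler lighter on the target site.
    import time
    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    with requests.Session() as session:
        session.headers.update(headers)
        for pageNum in range(1, 4):
            new_url = 'https://www.qiushibaike.com/imgrank/page/%d/' % pageNum
            page_text = session.get(new_url).text
            print(pageNum, len(page_text))  # simple progress indicator
            time.sleep(1)                   # wait a second before the next page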
    
    
    
    
    
    
    
    
    
    Keep striving. Don't be afraid, don't over-plan, don't get lost. Whatever happens, keep walking the road -- even if progress stalls at times, keep going.