I just started learning Python web scraping and wrote a small example to practice on.
The same code runs perfectly on Linux, but on Windows it hits all kinds of encoding errors.
The encoding issues left me thoroughly confused, so I ended up sticking with Python 3.x for this.
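A large part of the Windows pain comes from the fact that open() there defaults to the ANSI code page (typically GBK/cp936) rather than UTF-8, so writing Chinese text without an explicit encoding raises UnicodeEncodeError. A minimal sketch of the workaround (the file name out.txt is just a placeholder):

# Passing encoding explicitly makes the script behave the same on
# Windows (default cp936/GBK) and Linux (default UTF-8).
with open('out.txt', 'w', encoding='utf-8') as f:
    f.write('中文标题\n')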
The code is as follows:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time

# Board name -> URL of its first page.
title_list = [
    {'原创发布区': 'http://www.52pojie.cn/forum-2-1.html'},
    {'逆向资源区': 'http://www.52pojie.cn/forum-4-1.html'},
    {'脱壳破解区': 'http://www.52pojie.cn/forum-5-1.html'},
    {'动画发布区': 'http://www.52pojie.cn/forum-6-1.html'},
    {'悬赏问答区': 'http://www.52pojie.cn/forum-8-1.html'},
    {'水漫金山': 'http://www.52pojie.cn/forum-10-1.html'},
    {'站点公告': 'http://www.52pojie.cn/forum-13-1.html'},
    {'精品软件区': 'http://www.52pojie.cn/forum-16-1.html'},
    {'音乐视频': 'http://www.52pojie.cn/forum-19-1.html'},
    {'编程语言区': 'http://www.52pojie.cn/forum-24-1.html'},
    {'申请专区': 'http://www.52pojie.cn/forum-25-1.html'},
    {'LCG Area': 'http://www.52pojie.cn/forum-28-1.html'},
    {'病毒分析区': 'http://www.52pojie.cn/forum-32-1.html'},
    {'周年庆典活动专区': 'https://www.52pojie.cn/forum-36-1.html'},
    {'招聘求职': 'http://www.52pojie.cn/forum-39-1.html'},
    {'病毒样本区': 'http://www.52pojie.cn/forum-40-1.html'},
    {'安全工具区': 'http://www.52pojie.cn/forum-41-1.html'},
    {'电子书策划制作区': 'http://www.52pojie.cn/forum-42-1.html'},
    {'Key|Patch|共享账号': 'http://www.52pojie.cn/forum-44-1.html'},
    {'病毒救援区': 'http://www.52pojie.cn/forum-50-1.html'},
    {'影视推荐': 'http://www.52pojie.cn/forum-56-1.html'},
    {'LSG Area': 'http://www.52pojie.cn/forum-58-1.html'},
    {'软件调试区': 'http://www.52pojie.cn/forum-59-1.html'},
    {'T恤活动作品区': 'http://www.52pojie.cn/forum-62-1.html'},
    {'移动安全区': 'http://www.52pojie.cn/forum-65-1.html'},
    {'福利经验': 'http://www.52pojie.cn/forum-66-1.html'},
    {'2014CrackMe大赛': 'http://www.52pojie.cn/forum-67-1.html'},
    {'吾爱破解2016安全挑战赛': 'http://www.52pojie.cn/forum-71-1.html'},
    {'站务处理': 'http://www.52pojie.cn/forum-72-1.html'},
]

def get_html(url):
    # Retry forever on network errors, pausing 10 seconds between attempts.
    while True:
        try:
            response = requests.get(url)
            return response.text
        except Exception:
            time.sleep(10)
            continue

# Get the total number of pages in a board.
def get_page(url):
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    label_list = soup.find_all('label')
    # The page count sits in the fourth <label>'s <span>; slice the number out.
    page = int(label_list[3].span.string[3:-2])
    return page

# Download every page of the given board.
def page_down(url):
    page = get_page(url)
    print("总页数:" + str(page))  # total pages
    txt = input("请输入保存到的文件名(注意添加后缀):")  # output file name (with extension)
    for j in range(1, page + 1):
        print(("第" + str(j) + "页下载中").center(50, "="))  # downloading page j
        # Rebuild the URL for page j: drop the trailing "-1.html" and append "-<j>.html".
        html = get_html(url[:-7] + '-' + str(j) + '.html')
        soup = BeautifulSoup(html, 'lxml')
        a_list = soup.find_all('a', attrs={'class': 's xst'})
        # Append each thread title and its link to the output file.
        for a in a_list:
            with open(txt, 'a+', encoding='utf-8') as f:
                f.write(a.get_text())
                f.write(' ')
                f.write("https://www.52pojie.cn/" + a.attrs['href'])
                f.write('\n')  # one thread per line
        print(("第" + str(j) + "页下载完成").center(50, "■"))  # page j done

def main():
    # Print the board menu, two entries per row.
    # (col replaces a local variable that shadowed the time module.)
    i = 0
    col = 0
    for title in title_list:
        for key in title:
            if col == 1:
                print((str(i) + ':' + key).ljust(20))
                col = 0
            else:
                print((str(i) + ':' + key).ljust(20), end=" ")
                col += 1
            i += 1
    # Keep asking until the input is a valid board index.
    while True:
        try:
            print()
            num = int(input('请输入你要浏览的代号:'))  # board number to browse
            if num >= len(title_list) or num < 0:
                print('输入有误请重新输入')  # invalid input, try again
                continue
            else:
                break
        except Exception:
            print('输入有误请重新输入')
            continue
    # Look up the chosen board's URL and start downloading.
    dict_t = title_list[num]
    for key in dict_t:
        print(dict_t[key])
        page_down(dict_t[key])

if __name__ == '__main__':
    main()
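On the fetching side, requests decodes response.text using the charset from the HTTP headers, and falls back to ISO-8859-1 when none is declared, which can also produce mojibake for Chinese pages. A minimal, more defensive sketch of the fetch step, using requests' apparent_encoding (the function name get_html_safe is hypothetical, not part of the script above):

import requests

def get_html_safe(url):
    response = requests.get(url)
    # apparent_encoding is sniffed from the page bytes themselves, which is
    # more reliable than the header-derived guess when the server mislabels
    # or omits its charset.
    response.encoding = response.apparent_encoding
    return response.text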