main
from tkinter import *
import time
import tkinter as tk
import PythonTaoBao_TaobaoTest as TB
from tkinter import messagebox
import pandas as pd
from tkinter import scrolledtext
from tkinter import PhotoImage
from PIL import Image, ImageTk
import os
import csv

# GUI front end for the crawler: a main window with three action buttons and
# an EXIT button. The third button opens a dialog that runs the Taobao crawl
# through the TB module and then offers windows to inspect the results.

root = Tk()
root.geometry("600x700+550+100")
root.title("爬虫小助手")

# ------------------ Root layout ------------------
UpperBlock = Frame(root, width=1600, height=50, relief=SUNKEN)
UpperBlock.pack(side=TOP)
TAB = Frame(root, width=400, height=700, relief=SUNKEN)
TAB.pack(side=TOP)

# ------------------ Time info ------------------
# Captured once at start-up; the label below is not refreshed afterwards.
localtime = time.asctime(time.localtime(time.time()))

# ------------------ Header labels ------------------
lblinfo = Label(UpperBlock, font=('Times New Roman', 30, 'bold'),
                text="爬虫小助手", fg="blue", bd=10, anchor='w')
lblinfo.grid(row=0, column=0)
lblinfo = Label(UpperBlock, font=('aria', 20,), text=localtime,
                fg="blue", anchor=W)
lblinfo.grid(row=1, column=0)


# ------------------ Button actions ------------------
def _show_image_window(title, image_path):
    """Open a 600x800 window showing the image stored at ``image_path``.

    The image is loaded before the window is created so that a missing or
    unreadable file produces an error dialog instead of an empty window.
    """
    try:
        with Image.open(image_path) as img_open:
            photo = ImageTk.PhotoImage(img_open)
    except OSError as err:
        messagebox.showerror(title, "无法打开图片 %s: %s" % (image_path, err))
        return

    window = Toplevel()
    window.title(title)
    window.geometry("600x800+550+100")
    label_img = Label(window, image=photo)
    # Tk does not hold a Python reference to the image; keep one on the
    # widget so it is not garbage-collected while the window is open.
    label_img.image = photo
    label_img.place(x=0, y=0)


def _show_csv_contents(keyword):
    """Show the first four columns of ``<keyword>.csv`` in a scrolled text box.

    The same text is also written to ``<keyword>.txt``. The file is rewritten
    on every call; the previous append mode duplicated the rows each time the
    button was pressed.
    """
    csv_path = keyword + '.csv'
    txt_path = keyword + '.txt'
    try:
        data = pd.read_csv(csv_path, encoding='utf-8')
    except (OSError, ValueError) as err:
        messagebox.showerror("展示界面", "无法读取 %s: %s" % (csv_path, err))
        return

    # NOTE(review): fields and rows are separated by single spaces, exactly as
    # in the source; a newline per row may have been intended -- confirm.
    text = ''.join(
        ' '.join(str(value) for value in row[:4]) + ' '
        for row in data.values
    )
    try:
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(text)
    except OSError as err:
        messagebox.showerror("展示界面", "无法写入 %s: %s" % (txt_path, err))
        return

    window = Toplevel()
    window.title("展示界面")
    window.geometry("400x300+550+100")
    sda = scrolledtext.ScrolledText(window, width=50, height=10,
                                    font=("宋体", 10))
    sda.place(x=30, y=30)
    sda.insert(END, text)


def _show_results_menu(keyword):
    """Open the window with the three result-viewer buttons for ``keyword``."""
    rt2 = tk.Toplevel()
    rt2.title("调用成功")
    rt2.geometry('300x300')
    Button(rt2, text="查询内容",
           command=lambda: _show_csv_contents(keyword)).pack()
    Button(rt2, text="查询关键字",
           command=lambda: _show_image_window("查询关键字界面",
                                              keyword + '.png')).pack()
    Button(rt2, text="查询价格分布",
           command=lambda: _show_image_window("价格分布展示界面",
                                              keyword + '价格分布.png')).pack()


def _open_crawl_dialog():
    """Open the dialog that asks for a keyword and page count, then crawls."""
    rt = tk.Toplevel()
    rt.title("测试")
    rt.geometry('300x300')

    Label(rt, text="输入爬取的关键字:").pack()
    xls_text = StringVar()
    xls = Entry(rt, textvariable=xls_text)
    xls_text.set(" ")
    xls.pack()

    Label(rt, text="输入爬取的页数:").pack()
    xls2_text = StringVar()
    xls2 = Entry(rt, textvariable=xls2_text)
    xls2_text.set(" ")
    xls2.pack()

    def on_click():
        # The entries are pre-filled with a space; strip it so it does not
        # end up in the search query or in the generated file names.
        keyword = xls_text.get().strip()
        if not keyword:
            messagebox.showwarning("测试", "请输入爬取的关键字")
            return
        try:
            pages = int(xls2_text.get())
        except ValueError:
            messagebox.showerror("测试", "页数必须是整数")
            return
        if pages < 1:
            messagebox.showerror("测试", "页数必须大于 0")
            return

        print(keyword)
        # getData iterates range(1, deep), so pass pages + 1 to fetch
        # exactly ``pages`` pages.
        data = TB.getData(TB.TaobaoUrl, keyword, pages + 1)
        TB.SaveData(keyword, data)
        if not data:
            # A word cloud cannot be generated from empty text, so stop
            # before the figure steps.
            messagebox.showwarning("测试", "没有爬取到任何数据")
            return
        TB.Hottopic(keyword)
        TB.PriceFigure(keyword)
        _show_results_menu(keyword)

    Button(rt, text="press", command=on_click).pack()


def buttonclick(numbers):
    """Dispatch a main-window button press.

    ``numbers`` 1 and 2 show placeholder message boxes; any other value opens
    the crawl dialog.
    """
    if numbers == 1:
        messagebox.showinfo("事件1", "现在执行的是事件1")
    elif numbers == 2:
        messagebox.showinfo("事件2", "现在执行的是事件2")
    else:
        _open_crawl_dialog()


# ------------------ Main action buttons ------------------
button1 = Button(TAB, padx=16, pady=16, bd=7, fg="black",
                 font=('Times New Roman', 20, 'bold'), text="爬取淘宝",
                 bg="orange", command=lambda: buttonclick(1))
button1.grid(row=2, column=0)
button2 = Button(TAB, padx=16, pady=16, bd=7, fg="black",
                 font=('Times New Roman', 20, 'bold'), text="爬取豆瓣",
                 bg="orange", command=lambda: buttonclick(2))
button2.grid(row=2, column=1)
button2 = Button(TAB, padx=16, pady=16, bd=7, fg="black",
                 font=('Times New Roman', 20, 'bold'), text="爬取测试",
                 bg="orange", command=lambda: buttonclick(3))
button2.grid(row=2, column=3)


# ------------------ EXIT button ------------------
def qexit():
    """Close the main window, which ends the Tk main loop."""
    root.destroy()


buttonexit = Button(TAB, padx=16, pady=8, bd=10, fg="black",
                    font=('Times New Roman', 16, 'bold'), width=10,
                    text="EXIT", bg="orange", command=qexit)
buttonexit.grid(row=8, column=3)

# ------------------ Main loop ------------------
root.mainloop()
function_Zhu Jiu
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
from urllib.parse import urlencode
import xlwt
import sqlite3
import jieba
import operator
import pandas
from wordcloud import WordCloud
from matplotlib import pyplot as plt

# Taobao search scraper: fetches result pages, extracts item fields with
# regular expressions, stores them as CSV, and renders a price histogram and
# a title word cloud from the stored CSV.

FileName = "TaoBao.txt"
TaobaoUrl = 'https://s.taobao.com/search?'

# Patterns for the item fields embedded as JSON text in the result page.
FindTitle = re.compile(r'"raw_title":"(.*?)"')
FindPrice = re.compile(r'"view_price":"(.*?)"')
FindComment = re.compile(r'"comment_count":"(.*?)"')
FindSale = re.compile(r'"view_sales":"(.*?)"')
FindNid = re.compile(r'"nid":"(.*?)"')


def askURL(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    On ``urllib.error.URLError`` the error code and/or reason are printed and
    an empty string is returned.
    """
    # NOTE(security): a session cookie is hard-coded here. It should be moved
    # out of the source (config file or environment) and rotated -- flagged
    # for review, value left unchanged.
    head = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "cookie": "t=32c7a85cd15aa831b8ec0dc54623814b; cna=DG+ZFyYN83MCAW8Ci6iqyWAe; xlly_s=1; miid=2028550422116889343; lLtC1_=1; _m_h5_tk=c829cf68e876d8c46a690fa1bd32dceb_1603189177067; _m_h5_tk_enc=786725f712559720af4e547a1185869a; sgcookie=E100v3lUlY2Mn9qHcIOEWSWHoxmJEQFA84URgVCAvFhUhCztc60kkVLRNTHY0kh5Q1Gi6ra%2BVmZfTginF7icJwPfhQ%3D%3D; uc3=lg2=UIHiLt3xD8xYTw%3D%3D&id2=UUBYgrfF9R5FYg%3D%3D&nk2=F5QbQQuNek%2FZK1s%3D&vt3=F8dCufHGEaKtervENj4%3D; lgc=tbn60165318; uc4=nk4=0%40FY5Qy64y3SsmdbiPdQMfcOrjzIuaMg%3D%3D&id4=0%40U2LK%2FwS6ehmB45%2B%2FKSl4I7%2FD3ptE; tracknick=tbn60165318; _cc_=U%2BGCWk%2F7og%3D%3D; enc=vdrXdFKY2xozxiBQPzFgqdULeLbSJ%2FZphd1xAWDpxUmZxatQDDmH%2B34zrqcdQ4rwpZrtlQRAQqyFPuQS%2FDL%2FVg%3D%3D; mt=ci=0_1; thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; cookie2=7769de3eba14a2d9ac704fb765673036; _tb_token_=39e8e34355a48; JSESSIONID=546A2DF4811D091AEB4BF5C0C57FF8DE; tfstk=cTJlBRjtQQ55nU5D1Y6WXdfZuOcAZCEPP1C23KRqeRichtJViAV4_bZ8PZohmW1..; l=eBgyZQlROoK9Q3ooBOfwourza77OSIRAguPzaNbMiOCPOt1p5amAWZ5U_QY9C3GVh6qBR3l63A9LBeYBqCvan5U62j-la_kmn; isg=BAkJZSbcsjYTsE7Cj68GIl6uGDVjVv2IE5HXp6t-hfAv8ikE86YNWPcoNFbEqpXA",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the HTTP response (and its socket) even
        # when read() or decode() fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def getData(baseurl, keyword, deep):
    """Scrape search results for ``keyword`` and return them as rows.

    Pages ``1 .. deep - 1`` are requested (``range(1, deep)``), so ``deep``
    is one more than the number of pages fetched. Each row is
    ``[title, price, sale, comment, id]`` with every field a string. On each
    page the five field lists are truncated to the shortest one.
    """
    data = []
    for page in range(1, deep):
        # NOTE(review): the page number is sent as the ``s`` query parameter.
        # If the site treats ``s`` as an item offset rather than a page index,
        # consecutive requests would overlap heavily -- confirm.
        keys = {'q': keyword, 's': page}
        pageurl = baseurl + urlencode(keys)
        print(pageurl)
        html = askURL(pageurl)
        title = FindTitle.findall(html)
        price = FindPrice.findall(html)
        sale = FindSale.findall(html)
        Comment = FindComment.findall(html)
        ID = FindNid.findall(html)
        # zip() stops at the shortest list, matching the previous
        # min(len(...)) truncation.
        data.extend(list(row) for row in zip(title, price, sale, Comment, ID))
    return data


def SaveData(keyword, data):
    """Write ``data`` rows to ``<keyword>.csv`` (UTF-8 with BOM, no index)."""
    Save = pandas.DataFrame(data, columns=['title', 'price', 'sale', 'comment', 'id'])
    Save.to_csv('%s.csv' % keyword, encoding='utf_8_sig', index=False)


def ReadData(key):
    """Load ``<key>.csv`` and return it as a DataFrame."""
    data = pandas.read_csv('%s.csv' % key, encoding='utf_8_sig', engine='python')
    return data


def PriceFigure(key):
    """Save a 20-bin histogram of the ``price`` column to ``<key>价格分布.png``."""
    data = ReadData(key)
    plt.rcParams['font.family'] = ['sans-serif']
    plt.rcParams['font.sans-serif'] = ['SimHei']
    fig = plt.figure(figsize=(5, 5))
    try:
        plt.hist(data['price'], bins=20, alpha=0.6)
        plt.xlabel('价格')
        plt.ylabel('频数')
        plt.savefig('%s价格分布.png' % key)
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each call would leave another open figure behind.
        plt.close(fig)


def Hottopic(key):
    """Render a word cloud of all item titles to ``<key>.png``.

    Titles from ``<key>.csv`` are concatenated, segmented with jieba in full
    mode, and passed to WordCloud. When there is no text to render, a message
    is printed and no image is written.
    """
    data = ReadData(key)
    # Missing titles are dropped so the join cannot fail on a non-string
    # cell; join() also avoids the quadratic cost of repeated ``+=``.
    content = ''.join(data['title'].dropna().astype(str))
    word = " ".join(jieba.cut(content, cut_all=True))
    if not word.strip():
        # WordCloud.generate() raises on empty input; skip instead of crashing.
        print('No titles available for %s; word cloud not generated' % key)
        return
    Topic = WordCloud('simhei.ttf', background_color="black", width=1000, height=600).generate(word)
    Topic.to_file('%s.png' % key)