# 采集商品分类信息 — Collect product category information
import time

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By


# NOTE(review): this module was reconstructed from a whitespace-collapsed copy
# of the file. Statement nesting follows the most plausible reading; places
# where the nesting could not be determined with certainty are marked below.


def tianmao_catch_category():
    """Scrape the Tmall home-page category menu and store each category path.

    Opens the Tmall home page with a driver obtained from ``get_driver``,
    hovers over every top-level menu entry (collecting its title as the
    level-one category), then parses the rendered HTML with PyQuery and calls
    ``db.saveCategory`` once per (level one, level two, level three) path.

    Any ``Exception`` raised while scraping is printed, not propagated. The
    driver is always shut down before returning. Returns ``None``.
    """
    driver = get_driver('', False)
    try:
        url = 'https://www.tmall.com/?ali_trackid=2:mm_26632258_3504122_55934697:1609295236_235_1586302010&union_lens=recoveryid:1609295236_235_1586302010&clk1=3a059b6fd5d21a5e9086e711fdf3afe4&bxsign=tbkJxFfRkMJdwE3OwpP483v2+4G1PrzCDIDumBW7tv5QzQfc+xlm3i2oiRMn2bJl4qaPrxH6ekD1p3hgS1sBUJbM4REq9LyuFhLBITi5yXSBSs='
        driver.get(url)
        # Fixed pause after navigation before the menu is queried.
        time.sleep(10)

        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # no longer exists in Selenium 4.3+.
        menu_items = driver.find_elements(
            By.XPATH, "//ul[@class='normal-nav clearfix']/li")

        list1 = []
        for span in menu_items:
            # Hover over the entry with the mouse.
            ActionChains(driver).move_to_element(span).perform()
            # The original comment says the last replace removes characters
            # like U+E615.
            # NOTE(review): the literal holds U+FFFD in this copy of the
            # file - confirm the intended character.
            data_title = str(span.text).replace(' /', '/').strip().replace('� ', '')
            title_parts = data_title.split(' ')
            # Titles that split into any other number of parts are skipped.
            # NOTE(review): a skipped title shifts list1 relative to the
            # panels indexed below - confirm this is intended.
            if len(title_parts) == 1:
                list1.append(title_parts[0])
            elif len(title_parts) == 2:
                list1.append(title_parts[1])
            # NOTE(review): assumed to run once per menu entry (pause after
            # each hover) - confirm against the original indentation.
            time.sleep(3)

        selenium_html = driver.execute_script(
            "return document.documentElement.outerHTML")
        doc = pq(selenium_html)
        sub_spans = doc("div[class='content-con j_categoryContent']").find(
            "div[class='pannel-con j_CategoryMenuPannel']").find(
            "div[class^='pannel-']")
        print(' ')

        netname = '天猫'
        for index, panel in enumerate(sub_spans.items()):
            # Panels are paired with the collected titles by position; an
            # IndexError here is reported by the handler below.
            category_one = list1[index]
            two_item = panel.find("div[class='hot-word-con']").find(
                "div[class='hot-word-line']")
            for line in two_item.items():
                category_two = line.find("div[class='line-title']").find(
                    "div[class='title-text']").text()
                links = line.find("div[class='line-con']").find(
                    "a[class^='hot-word']")
                for link in links.items():
                    category_three = link.text()
                    print(category_one, category_two, category_three)
                    db.saveCategory(netname, category_one, category_two,
                                    category_three)
        print(' ')
    except Exception as ex:
        print(ex)
    finally:
        # Always release the browser, even on KeyboardInterrupt/SystemExit.
        driver.quit()


def jingdong_catch_category():
    """Scrape the JD home-page category menu and store each category path.

    Opens the JD home page with a driver obtained from ``get_driver``, hovers
    over every ``cate_menu_item`` entry (collecting its title as the level-one
    category), then parses the rendered HTML with PyQuery and calls
    ``db.saveCategory`` for the channel links and the detail links of every
    ``cate_part`` panel.

    Any ``Exception`` raised while scraping is printed, not propagated. The
    driver is always shut down before returning. Returns ``None``.
    """
    driver = get_driver('', False)
    try:
        url = 'https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_48ba7a220ee5462c97fc2d5f3691e5c5'
        driver.get(url)
        # Fixed pause after navigation before the menu is queried.
        time.sleep(10)

        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # no longer exists in Selenium 4.3+.
        menu_items = driver.find_elements(
            By.XPATH,
            "//ul[@class='JS_navCtn cate_menu']/li[@class='cate_menu_item']")

        list1 = []
        for span in menu_items:
            ActionChains(driver).move_to_element(span).perform()
            # NOTE(review): the last literal holds U+FFFD in this copy of the
            # file - confirm the intended character.
            data_title = str(span.text).replace('/ ', '/').replace(' /', '/').strip().replace('� ', '')
            print('data_title=', data_title)
            list1.append(data_title)
            # NOTE(review): assumed to run once per menu entry (pause after
            # each hover) - confirm against the original indentation.
            time.sleep(3)

        selenium_html = driver.execute_script(
            "return document.documentElement.outerHTML")
        doc = pq(selenium_html)
        sub_spans = doc("div[id='J_popCtn']").find(
            "div[class='cate_part clearfix']")
        print(' ')

        index = 0
        netname = '京东'
        for part in sub_spans.items():
            # Panels are paired with the collected titles by position.
            category_one = list1[index]

            # Channel links: the first link is used as the level-two name and
            # each later link as a level-three name under it.
            channel_links = part.find("div[class='cate_part_col1']").find(
                "div[class='cate_channel']").find("a[class='cate_channel_lk']")
            category_two = ''
            for index1, channel in enumerate(channel_links.items()):
                category_three = ''
                if index1 == 0:
                    category_two = str(channel.text())
                else:
                    category_three = str(channel.text())
                # NOTE(review): assumed to run for every link, including the
                # first (with an empty level three) - confirm.
                print(category_one, category_two, category_three)
                db.saveCategory(netname, category_one, category_two,
                                category_three)

            # Detail lists.
            # NOTE(review): only the first <dl> supplies category_two, and its
            # own <dd> links are not saved; later <dl> entries reuse that
            # title. Confirm this matches the page structure.
            detail_items = part.find("div[class='cate_part_col1']").find(
                "div[class='cate_detail']").find(
                "dl[class^='cate_detail_item cate_detail_item']")
            category_two = ''
            for index1, detail in enumerate(detail_items.items()):
                if index1 == 0:
                    category_two = str(
                        detail.find("dt[class='cate_detail_tit']").find(
                            "a[class='cate_detail_tit_lk']").text())
                else:
                    detail_links = detail.find(
                        "dd[class='cate_detail_con']").find(
                        "a[class='cate_detail_con_lk']")
                    for link in detail_links.items():
                        category_three = str(link.text())
                        print(category_one, category_two, category_three)
                        db.saveCategory(netname, category_one, category_two,
                                        category_three)
            index += 1
        print(' ')
        # Number of panels processed.
        print(index)
    except Exception as ex:
        print(ex)
    finally:
        # Always release the browser, even on KeyboardInterrupt/SystemExit.
        driver.quit()