盖德化工采集新方案

Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门

https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865

# -*- coding: utf-8 -*-
"""
Created on Sun May 15 20:41:32 2016

@author: daxiong
"""
 
import requests,bs4,csv,time,random,os
 
#Name of the file meant to hold all second-level URLs (not referenced by the code visible in this file)
fileName='combinedFile.csv'
#Directory for the second-level URLs (no value is actually defined for it here)

#Base URL of the supplier listing; the area parameter is the URL-encoded name of Hubei province
site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
#Number of listing pages to crawl for the URL above
pages_hubei=31

def Get_sites(site,pages):
    """Build the URLs of listing pages 1 through ``pages``.

    Each URL is ``site`` followed by ``-p<N>.html``. The result is a list of
    strings in ascending page order; it is empty when ``pages`` is below 1.
    """
    page_numbers=range(1,pages+1)
    return [site+"-p"+str(number)+".html" for number in page_numbers]

#Collect the company records of one listing page (about 20 per page)
def Get_page_information(elems):
    """Extract one row per company from the company blocks of a listing page.

    Parameters:
        elems: sequence of parsed elements (one per company) that support
            ``select(css_selector)`` and whose results expose ``.text``.

    Returns:
        A list of ``[company_name, main_product, phone, address]`` lists.
        A company whose block lacks any of these pieces is reported with
        ``print`` and left out of the result.

    NOTE(review): the separator literals of the original strip()/split()
    calls were lost (they appear as raw line breaks in the published
    source). The contact text is therefore split on line boundaries with
    blank lines dropped, which works whether the separator was "\\r\\n" or
    "\\n\\n" -- confirm against a live page.
    """
    list_rows_information=[]
    for i,elem in enumerate(elems):
        try:
            #company name
            company_name=elem.select(".dblue")[0].text
            #main products: the second <li> of the block
            main_product=elem.select("li")[1].text.strip("\n")
            #contact details: first non-blank line is the phone, second the address
            content_contact=elem.select(".site_l")[0].text
            list_content_contact=[line.strip() for line in content_contact.splitlines() if line.strip()]
            phone=list_content_contact[0]
            address=list_content_contact[1]
        except Exception as err:
            #Best effort: skip incomplete entries, but unlike a bare
            #"except:" this does not swallow KeyboardInterrupt.
            print("error at:",i,err)
            continue
        list_rows_information.append([company_name,main_product,phone,address])
    return list_rows_information


 
#Write one page of rows to a CSV file; list_tableContent is a 2-D list such as [[a],[b],[c]]
def Write_table_to_csv(fileName,list_tableContent):
    """Write ``list_tableContent`` to ``fileName`` as CSV, replacing any existing file.

    Parameters:
        fileName: path of the CSV file to create or overwrite.
        list_tableContent: iterable of rows, each row itself a list of
            fields. A row that is a bare string would be written one
            character per column, so every row must be a list.
    """
    #newline='' leaves line-ending handling to the csv module; the with-block
    #closes the file even when writerows raises.
    with open(fileName,'w',newline='') as file:
        csv.writer(file).writerows(list_tableContent)
     
#Scrape every listing page and write one CSV file per page
def Write_allTables_to_csvs(list_pages):
    """Download each URL in ``list_pages`` and save its companies to ``<n>.csv``.

    Page ``n`` (1-based position in ``list_pages``) is written to ``n.csv``
    in the current working directory. A page that fails to download, parse
    or write is reported with ``print`` and skipped. After each successful
    page the function sleeps 10-15 seconds before the next request.

    Parameters:
        list_pages: list of listing-page URLs, e.g. as built by Get_sites.
    """
    #Iterate over the argument itself rather than the global page count, so
    #a list of any length is processed completely.
    for i,url in enumerate(list_pages):
        try:
            #Without a timeout a stalled connection would block the crawl forever
            res=requests.get(url,timeout=30)
            soup=bs4.BeautifulSoup(res.text,"lxml")
            #one element per company block
            elems=soup.select(".clist_list_content_r")
            list_rows_information=Get_page_information(elems)
            filename=str(i+1)+".csv"
            Write_table_to_csv(filename,list_rows_information)
            #random pause between requests
            time.sleep(random.randint(10,15))
        except Exception as err:
            #Best effort per page; "except Exception" keeps Ctrl-C working
            print("error at:",i,err)
            continue
#Main script
#Build the URLs of all listing pages (pages_hubei pages in total)
list_pages=Get_sites(site_hubei,pages_hubei)

'''
#生产所有csv文件         
Write_allTables_to_csvs(list_pages)
'''

#Debug run: fetch one listing page and parse the contact block of a single
#company, step by step.
i=3
res=requests.get(list_pages[i])
soup=bs4.BeautifulSoup(res.text,"lxml")
elems=soup.select(".clist_list_content_r")
#contact details
elems_contact=elems[2].select(".site_l")
content_contact=elems_contact[0].text
#NOTE(review): the original strip()/split() separator literals were lost
#(they appear as raw line breaks in the published source). Splitting on line
#boundaries and dropping blank lines works whether the separator was "\r\n"
#or "\n\n" -- confirm against a live page.
list_content_contact=[line.strip() for line in content_contact.splitlines() if line.strip()]
#Information is sometimes missing; TODO: filter the text with a regular expression
phone=list_content_contact[0]
address=list_content_contact[1]

运行结果：i=2 和 i=9 时出错（对应条目的信息可能缺失，导致解析失败）。

相关阅读:
迁移式学习
 VMware Workstation 16激活码
 OpenStack安装部署
 git码云操作
 vs 2019 正则替换
 linux中Redis单机安装
 ASP.NET/C#执行数据库过程函数带RETURN的项目接收。
IDEA配置部属Tomcat
Java集合之HashMap源码分析(put()方法)
反编译一款APP然后重新打包（Windows环境）
原文地址：https://www.cnblogs.com/webRobot/p/5496377.html