# -*- coding: utf-8 -*-
# @Author : xuchunlin
# @Time : 2020/7/20 10:41
# @License : Copyright(C),Drcnet
# from common.contest import *
from selenium import webdriver
import time
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
# from common.contest import logger
# Start a Firefox session through Selenium and load the customs.gov.cn index page.
driver = webdriver.Firefox()
driver.get('http://www.customs.gov.cn/customs/302249/302274/302277/index.html')
# driver.get('http://www.customs.gov.cn/customs/302249/302274/302277/302276/310398/index.html')
# Fixed 20-second pause before the page source is read.
# NOTE(review): presumably gives the page time to finish loading/rendering - confirm.
time.sleep(20)
# Snapshot of the browser's current HTML as a string.
result = driver.page_source
# print(result)
from bs4 import BeautifulSoup
# Parse the snapshot with the built-in html.parser backend and keep the first
# element matching the CSS selector div#mainBox; the [0] index raises
# IndexError when the page contains no such element.
soup = BeautifulSoup(result,'html.parser')
result_mainBox = soup.select('div#mainBox')[0]
import re
def replace_br(newline):
    """
    Flatten an HTML fragment onto one line and strip its comments.

    The argument is converted with ``str()``; then line-break characters,
    spaces, the literal text ``amp;`` and ``<br/>`` / ``<br>`` tags are
    removed.  HTML comments whose body contains no ``>`` are deleted whole;
    for any other comment only the ``<!--`` and ``-->`` markers are dropped
    and the comment body is left in place.

    :param newline: any object; its ``str()`` form is what gets cleaned
    :return: the cleaned string
    """
    text = str(newline)
    # The previous chain of .replace() calls carried raw line-break
    # characters inside single-quoted literals, which is not valid Python.
    # splitlines() recognises every line-boundary character, so joining the
    # pieces removes all of them in a single pass.
    text = ''.join(text.splitlines())
    # Order matters: 'amp;' is removed before the <br> variants, as before.
    for token in (' ', 'amp;', '<br/>', '<br>'):
        text = text.replace(token, '')
    # Drop simple comments entirely, then strip any leftover comment markers.
    text = re.sub('<!--[^>]*-->', '', text)
    return text.replace('<!--', '').replace('-->', '')
# Collapse the main box markup into one cleaned string, then release the
# browser window; everything below works on the captured text only.
result_mainBox_replace = replace_br(result_mainBox)
driver.close()

# Capture the remainder of each centred, 25px-high table row.
# NOTE(review): replace_br() strips spaces as written, while this pattern and
# the 'TEXT-INDENT: 5px' check below both contain spaces - confirm that they
# can still match the cleaned markup.
result_list = re.findall('<tbody><tr align="center" height="25"(.*?)</tr></tbody>', result_mainBox_replace)
print(len(result_list))

for item in result_list:
    if 'TEXT-INDENT: 5px' in item:
        print(1111111111)  # debug marker preceding a matching row
        # The earlier .replace() literals held raw line breaks (invalid
        # Python); splitlines() removes every line-boundary character.
        # NOTE(review): the file's indentation was lost, so whether this
        # print sits inside the if or directly under the for is a guess.
        print(''.join(item.splitlines()).replace(" ", ''))

# NOTE(review): long pause (2222 seconds); whether it originally sat inside
# the loop could not be determined from the file - confirm placement.
time.sleep(2222)