方法1.
def get_all_news(self, response, file):
    """Write every text fragment selected from ``response`` to ``file``.

    The XPath query selects ``<p>`` elements that have no ``<img>`` child and
    no ``<a>`` child and whose position is greater than 5, together with
    ``<div>`` elements whose ``class`` attribute contains ``WB_text``.

    Args:
        response: Object exposing ``xpath(query).extract()``; presumably a
            Scrapy response -- confirm against the caller.
        file: Writable text stream; each fragment is written followed by
            ``'\t\n'``.
    """
    # Substrings to strip from each fragment. Every candidate (r'</', r'p>',
    # r'<') is currently disabled, so fragments are written unmodified.
    rm_list = []

    # Read <p> tags that do not contain an <img>.
    query = (
        '//p[not (img)][not (a)][position()>5]'
        ' | //div[contains(@class,"WB_text")]'
    )
    fragments = response.xpath(query).extract()

    for fragment in fragments:
        for token in rm_list:
            if not fragment[:-1].strip():
                # Nothing but whitespace before the last character: blank it.
                fragment = ''
            elif token in fragment:
                fragment = fragment.replace(token, '').strip()
        file.write(fragment + '\t\n')
方法2.
# Ordered (regex pattern, replacement text) pairs applied by
# processSpeakingTxt. The replacements are the Chinese spoken forms:
# "negative", "left parenthesis", "right parenthesis".
# Patterns are raw strings so that `\(` / `\)` reach the regex engine
# unchanged without triggering Python's invalid-escape-sequence warning.
PRE_PROCESSING_EXPRESSION = (
    (r'[-]', '负'),
    (r'[\(]', '左括号'),
    (r'[\)]', '右括号'),
    # ...... (further pairs were elided in the original)
)
def processSpeakingTxt(txtStr, expressions=None):
    """Apply a sequence of regex substitutions to ``txtStr`` and return it.

    Args:
        txtStr: Text to rewrite.
        expressions: Optional iterable of ``(pattern, replacement)`` pairs,
            applied in order. When omitted (``None``), the module-level
            ``PRE_PROCESSING_EXPRESSION`` table is used.

    Returns:
        The text after every substitution has been applied in order.
    """
    if expressions is None:
        # Reading a module global needs no `global` declaration.
        expressions = PRE_PROCESSING_EXPRESSION
    for pattern, replacement in expressions:
        # re.sub returns the input unchanged when nothing matches, so a
        # preceding re.search would only scan the text a second time.
        txtStr = re.sub(pattern, replacement, txtStr)
    return txtStr
方法3. 修改文件名
def change_filename2(path, str):
    """Rename files in ``path`` by removing ``str`` from their names.

    Every directory entry whose name contains ``str`` is renamed to the same
    name with all occurrences of ``str`` removed. An entry is left untouched
    when the target name already exists or would be empty. Rename failures
    are printed and do not stop the remaining files from being processed.

    Args:
        path: Directory whose entries are renamed.
        str: Substring to remove from matching names. (The name shadows the
            builtin but is kept so keyword callers continue to work.)
    """
    files = [f for f in os.listdir(path) if str in f]
    print(files)
    for f in files:
        new_name = f.replace(str, '')
        if not new_name:
            # The whole name was the substring; there is nothing to rename to.
            continue
        # Build full paths instead of calling os.chdir(path): changing the
        # working directory leaked out of this function and made a relative
        # `path` unresolvable from the second file onward.
        source = os.path.join(path, f)
        target = os.path.join(path, new_name)
        try:
            if not os.path.exists(target):
                os.rename(source, target)
        except OSError as e:
            print(e)