'''Read marker-delimited HTML records from a text file, split their text into
sentences, and write the records that pass the length filters to text files.

Scratch SQL notes kept from the source (not executed by this script):

    SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;

    -- Paging pattern: page x, y rows per page.
    select top y * from <table> where <pk> not in(select top (x-1)*y <pk> from <table>)
    -- If the table has no primary key, a temp table with an identity column
    -- can be used instead; x and y may be variables.
    select id=identity(int,1,1),* into #tb from <table>
    select * from #tb where id between (x-1)*y and x*y-1

    select top 1000 Info_ID from Info_Roles

    select top 2000 Info_ID,',xiaole20180410SPLIT,',content from Info_Content
    where Info_ID not in( select top 1000 Info_ID from Info_Roles ) ;

    select top 399 Info_ID,',xiaole20180410SPLIT,',UPPER(content) from Info_Content ;

    select top 399 CHARINDEX('IMG',UPPER(content)) from Info_Content ;

    select top 15 Info_ID,',xiaole20180410SPLIT,',content from Info_Content
    where CHARINDEX('IMG',UPPER(content))>0;

    select top 15 Info_ID,',xiaole20180410SPLIT,',content from Info_Content
    where Info_ID in( select top 1000 Info_ID from Info_Roles )
    and CHARINDEX('IMG',UPPER(content))>0 ;

    SELECT TOP 15 Info_ID, ',xiaole20180410SPLIT,', content FROM Info_Content
    WHERE Info_ID IN ( SELECT TOP 1000 Info_ID FROM Info_Roles WHERE Flag = 1 )
    AND CHARINDEX('IMG', UPPER(content)) > 0;

    SELECT TOP 200 Info_ID, ',xiaole20180410SPLIT,', content FROM Info_Content
    WHERE Info_ID IN ( SELECT TOP 90000 Info_ID FROM Info_Roles )
    AND CHARINDEX('<IMG', UPPER(content)) > 0;
'''
import glob
import os
import random
import threading
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# Marker that separates the id column from the HTML column in the input file
# (the same literal is selected as a column in the SQL notes above).
xlsplit_str = ',xiaole20180410SPLIT,'

# Input file. The source first assigned 'db.uid.para.txt' and immediately
# overrode it with the name below; only the effective value is kept.
f_db_txt, uid_d = 'db.uid.para.byhand.txt', {}

# Running counter used to synthesise record ids ('byhand1', 'byhand2', ...).
uid_ = 0

# Id of the record currently being accumulated; None until the first marker
# line has been seen.
uid = None

with open(f_db_txt, 'r', encoding='utf-8') as fr:
    for i in fr:
        # NOTE(review): the source strips the same ' ' literal twice, which
        # looks like two different whitespace escapes (e.g. newline / tab)
        # collapsed to spaces - confirm. Kept as the visible literal.
        i = i.replace(' ', '')
        if xlsplit_str in i:
            # A marker line starts a new record; the text after the first
            # marker is the beginning of its HTML.
            l = i.split(xlsplit_str)
            uid_ += 1
            uid = '{}{}'.format('byhand', uid_)
            uid_d[uid] = {'html': l[1]}
        elif uid is not None:
            # Continuation line: append to the HTML of the current record.
            uid_d[uid]['html'] = '{}{}'.format(uid_d[uid]['html'], i)
        # Lines before the first marker belong to no record and are skipped;
        # previously they raised NameError because no record id existed yet.

r_d = {}

# --- Chinese sentence splitting ---------------------------------------------

# Characters treated as sentence terminators by Cut().
# NOTE(review): several entries are duplicated; they may originally have been
# full-width punctuation variants - confirm against the original file.
cutlist = ['。', ';', '?', '.', ';', '?', '...', '、、、', ':', ':', ',', ',']

# Separator list shared by the paragraph/sentence helpers below (the source
# repeated this literal in every helper).
# NOTE(review): the two leading ' ' entries may originally have been
# whitespace escapes such as newline / tab - confirm.
_SENTENCE_SPLIT_TOKENS = (' ', ' ', '。', ';', '?', '.', ';', '?', '...', '、、、', ',', ',')


def FindToken(cutlist, char):
    """Return True if ``char`` is one of the terminators in ``cutlist``."""
    return char in cutlist


def Cut(cutlist, lines):
    """Split a sequence of characters into sentences.

    Args:
        cutlist: collection of sentence-terminating tokens.
        lines: iterable of single characters (for example ``list(text)``).

    Returns:
        A list of sentences, each ending with the terminator that closed it.
        Characters after the last terminator are not returned. Because the
        input is examined one character at a time, multi-character entries
        of ``cutlist`` (such as '...') never match.
    """
    sentences = []
    pending = []  # characters collected since the previous terminator
    for ch in lines:
        pending.append(ch)
        if FindToken(cutlist, ch):
            sentences.append(''.join(pending))
            pending = []
    return sentences


def paragraph_to_sentence(paragraph, sentence_l):
    """Recursively split ``paragraph`` and append the pieces to ``sentence_l``.

    Spaces are removed first. Separators are then tried in order: the text
    before the first occurrence is appended, the second piece (``ll[1]``
    only) is processed recursively, and the scan stops at the first
    separator that does not occur in the text.

    Args:
        paragraph: text to split.
        sentence_l: list that receives the pieces (mutated in place).

    Returns:
        The same ``sentence_l`` object.
    """
    paragraph = paragraph.replace(' ', '')
    for sep in _SENTENCE_SPLIT_TOKENS:
        ll = paragraph.split(sep)
        sentence_l.append(ll[0])
        if len(ll) > 1:
            paragraph_to_sentence(ll[1], sentence_l)
        else:
            break
    return sentence_l


def paragraph_to_sentence_no_recursion(paragraph, sentence_l):
    """Same contract as :func:`paragraph_to_sentence`.

    The source body was identical to ``paragraph_to_sentence`` (including the
    recursive call), so this simply delegates; despite the name it is still
    recursive.
    """
    return paragraph_to_sentence(paragraph, sentence_l)


# Module-level leftovers from the source: the same split applied to an empty
# paragraph, which leaves sentence_l == [''].
paragraph = ''
sentence_split_l = list(_SENTENCE_SPLIT_TOKENS)
sentence_l = paragraph_to_sentence(paragraph, [])
paragraph = paragraph.replace(' ', '')


def sentence_l_to_sentence_l_l(sentence_l):
    """Expand each sentence against every separator.

    For every sentence and every separator: if the separator occurs in the
    sentence, all split pieces are added; otherwise the sentence itself is
    added.

    NOTE(review): a sentence is therefore added once per separator that does
    not occur in it, which looks unintended - behavior kept as in the source.

    Args:
        sentence_l: list of strings.

    Returns:
        A new flat list of strings.
    """
    sentence_l_l = []
    for sentence in sentence_l:
        for sep in _SENTENCE_SPLIT_TOKENS:
            pieces = sentence.split(sep)
            if len(pieces) > 1:
                sentence_l_l += pieces
            else:
                sentence_l_l.append(sentence)
    return sentence_l_l


# --- Output -------------------------------------------------------------------

# Backslashes are escaped: the unescaped form in the source is not a valid
# string literal ('\U' escape, and the trailing backslash swallows the quote).
# The source assigned a '...\\dl_img_tmp\\' directory first and then overrode
# it with the value below.
img_dir = 'C:\\Users\\sas\\PycharmProjects\\produce_video\\mypng\\'

os_sep = os.sep
this_file_abspath = os.path.abspath(__file__)
this_file_dirname = os.path.dirname(this_file_abspath)
this_file_name = os.path.basename(this_file_abspath)

# Full listing: '<script name>.txt'.
fw_f = '{}{}'.format(this_file_name, '.txt')
# One-row listing. The source expression evaluated to the same path as fw_f,
# so both handles would have written into one file; a distinct suffix is used.
# NOTE(review): the '.onerow.txt' name is chosen here - confirm.
fw_f_onerow = '{}{}'.format(this_file_name, '.onerow.txt')

# A record is skipped when it yields fewer sentences than this ...
sen_num = 32
# ... or when any single sentence is longer than this many characters.
_MAX_SENTENCE_LEN = 64
# Number of leading sentences written per accepted record.
_SENTENCES_PER_RECORD = 31

with open(fw_f, 'w', encoding='utf-8') as fw_txt, \
        open(fw_f_onerow, 'w', encoding='utf-8') as fw_txt_onerow:
    for uid in uid_d:
        str_ = uid_d[uid]['html']

        # Round-trip the HTML through a uniquely named temp file, then parse
        # it from there.
        fhtml = 'qqzong.vedio.allinone.tmp.html'
        fhtml = '{}{}{}{}'.format('D:\\myv\\myhtml\\', int(time.time()),
                                  random.randint(1234, 6789), fhtml)
        with open(fhtml, 'w', encoding='utf-8') as fw:
            fw.write(str_)
        with open(fhtml, 'r', encoding='utf-8') as fo:
            soup = BeautifulSoup(fo, 'html.parser')

        sentence_l = Cut(list(cutlist), list(soup.text))

        # Filter on the number of sentences and on individual sentence length.
        if len(sentence_l) < sen_num:
            continue
        if any(len(sen) > _MAX_SENTENCE_LEN for sen in sentence_l):
            continue

        # NOTE(review): the trailing ' ' in the strings written below may
        # originally have been a newline - confirm. Kept byte-for-byte.
        s = '{}{}{}'.format('-----------------------', uid, '---------------------------------------- ')
        fw_txt.write(s)
        # The source called the file object itself (TypeError); write() is
        # what was intended.
        fw_txt_onerow.write(s)

        for sen in sentence_l[:_SENTENCES_PER_RECORD]:
            s = '{}{}'.format(sen, ' ')
            print(s)
            fw_txt.write(s)

        fw_txt_onerow.write(''.join(sentence_l[0:_SENTENCES_PER_RECORD]))

        # Contact line, e.g. '联系方式:王经理13212312312'.
        s = '{}{}{}'.format('联系方式:王经理', random.randint(13200000000, 15812341234), ' ')
        fw_txt.write(s)
        fw_txt_onerow.write(s)

dd = 9