• _io.TextIOWrapper


    '''
    SELECT * FROM Info_Roles WHERE Flag=1 LIMIT 2;
    
    
     select   top   y   *   from   表   where   主键   not   in(select   top   (x-1)*y   主键   from   表)
    
    
    
      如果表中无主键,可以用临时表,加标识字段解决.这里的x,y可以用变量.
    
      select   id=identity(int,1,1),*     into   #tb   from   表
      select   *   from   #tb   where   id   between   (x-1)*y   and   x*y-1
    
    
    
    
     select   top   1000   Info_ID   from   Info_Roles
     select   top   2000   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where   Info_ID   not   in( select   top   1000   Info_ID   from   Info_Roles   )   ;
     select   top   399   Info_ID,',xiaole20180410SPLIT,',UPPER(content)   from   Info_Content      ;
     select   top   399   CHARINDEX('IMG',UPPER(content))   from   Info_Content      ;
     select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where  CHARINDEX('IMG',UPPER(content))>0;
     select   top   15   Info_ID,',xiaole20180410SPLIT,',content   from   Info_Content   where
     Info_ID      in( select   top   1000   Info_ID   from   Info_Roles   )  and
      CHARINDEX('IMG',UPPER(content))>0
     ;
    
    
    
    SELECT
    	TOP 15 Info_ID,
    	',xiaole20180410SPLIT,',
    	content
    FROM
    	Info_Content
    WHERE
    	Info_ID IN (
    		SELECT
    			TOP 1000 Info_ID
    		FROM
    			Info_Roles
    		WHERE
    			Flag = 1
    	)
    AND CHARINDEX('IMG', UPPER(content)) > 0;
    
    
    
    
    
    SELECT
    	TOP 200 Info_ID,
    	',xiaole20180410SPLIT,',
    	content
    FROM
    	Info_Content
    WHERE
    	Info_ID IN (
    		SELECT
    			TOP 90000 Info_ID
    		FROM
    			Info_Roles
    	)
    AND CHARINDEX('<IMG', UPPER(content)) > 0;
    
    
    
    '''
    
    from bs4 import BeautifulSoup
    from selenium import webdriver
    
    # Marker that sits between the id column and the HTML column of an exported row.
    xlsplit_str = ',xiaole20180410SPLIT,'
    # Input dump to read; uid_d maps a generated record id to {'html': <markup>}.
    f_db_txt, uid_d = 'db.uid.para.byhand.txt', {}
    uid_ = 0  # running counter used to build the 'byhand<N>' record ids
    uid = None  # id of the record currently being accumulated, if any
    with open(f_db_txt, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.replace('\t', '').replace('\n', '')
            if xlsplit_str in line:
                # A marker opens a new record; the text following it is the HTML.
                uid_ += 1
                uid = '{}{}'.format('byhand', uid_)
                uid_d[uid] = {'html': line.split(xlsplit_str)[1]}
            elif uid is not None:
                # No marker: this line continues the HTML of the open record.
                uid_d[uid]['html'] = '{}{}'.format(uid_d[uid]['html'], line)
            # Lines that appear before the first marker belong to no record and
            # are skipped (previously this path raised NameError on `uid`).

    r_d = {}
    
    '''
    中文分句
    '''
    # Sentence-delimiter tokens; the report loop in this file passes a copy to Cut().
    # NOTE(review): Cut() tests one character at a time, so the multi-character
    # entries ('...' and '、、、') can never match there. The repeated entries look
    # like full-width punctuation that was normalised to ASCII — confirm.
    cutlist = ['。', ';', '?', '.', ';', '?', '...', '、、、', ':', ':', ',', ',']
    
    
    # Tell whether a character is one of the sentence-delimiter tokens.
    def FindToken(cutlist, char):
        """Return True if ``char`` is an element of ``cutlist``, otherwise False.

        Args:
            cutlist: Container of delimiter tokens (anything supporting ``in``).
            char: The candidate token, normally a single character.

        Returns:
            bool: Result of the membership test.
        """
        return char in cutlist
    
    
    # Core sentence splitter.
    def Cut(cutlist, lines, keep_tail=False):
        """Split ``lines`` into sentences, each one ending with its delimiter.

        Args:
            cutlist: Container of delimiter tokens; every item of ``lines`` is
                tested for membership in it.
            lines: Iterable of characters (a ``str`` or a list of 1-char strings).
            keep_tail: When True, text that follows the last delimiter is
                returned as a final, unterminated element. The default (False)
                keeps the historical behaviour of discarding that remainder.

        Returns:
            list[str]: The sentences in input order.
        """
        sentences = []
        current = []  # characters collected since the previous delimiter
        for ch in lines:
            current.append(ch)
            if ch in cutlist:
                # The delimiter closes the sentence and stays attached to it.
                sentences.append(''.join(current))
                current = []
        if keep_tail and current:
            sentences.append(''.join(current))
        return sentences
    
    
    '''
    
    '''
    
    
    def paragraph_to_sentence(paragraph, sentence_l):
        """Split ``paragraph`` into sentences and append them to ``sentence_l``.

        Spaces are removed first, then the text is cut at every delimiter in the
        list below. Delimiters are not kept and empty fragments are skipped.

        The previous implementation recursed on only the second fragment of each
        split, so it lost everything after it and re-appended the whole
        paragraph for the next delimiter; this version cuts on all delimiters in
        a single pass.

        Args:
            paragraph: Text to split.
            sentence_l: List that receives the sentences (mutated in place).

        Returns:
            list: The same ``sentence_l`` object, for convenience.
        """
        import re

        sentence_split_l = ['\n', '\t', '。', ';', '?', '.', ';', '?', '...', '、、、', ',', ',']
        # Longest tokens first so '...' is consumed whole rather than as three '.'.
        ordered = sorted(sentence_split_l, key=len, reverse=True)
        pattern = '|'.join(re.escape(token) for token in ordered)
        paragraph = paragraph.replace(' ', '')
        sentence_l.extend(piece for piece in re.split(pattern, paragraph) if piece)
        return sentence_l
    
    
    def paragraph_to_sentence_no_recursion(paragraph, sentence_l):
        """Iteratively split ``paragraph`` into sentences appended to ``sentence_l``.

        Spaces are removed, every delimiter in the list below is rewritten to a
        newline (itself one of the delimiters), and the text is then cut on
        newlines. Delimiters are not kept and empty fragments are skipped.

        The previous body was a copy of the recursive variant and still called
        it; this version contains no recursion and no call to any sibling.

        Args:
            paragraph: Text to split.
            sentence_l: List that receives the sentences (mutated in place).

        Returns:
            list: The same ``sentence_l`` object, for convenience.
        """
        sentence_split_l = ['\n', '\t', '。', ';', '?', '.', ';', '?', '...', '、、、', ',', ',']
        text = paragraph.replace(' ', '')
        for token in sentence_split_l:
            # Collapse every delimiter onto one sentinel so a single split suffices.
            text = text.replace(token, '\n')
        sentence_l.extend(piece for piece in text.split('\n') if piece)
        return sentence_l
    
    
    # Module-level dry run of the splitting loop on an empty paragraph.
    # With paragraph == '' the first split yields [''], so the loop appends ''
    # once and breaks; the paragraph_to_sentence() call is never reached.
    paragraph = ''
    sentence_l = []
    paragraph = paragraph.replace(' ', '')
    # The newline and tab delimiters are written as escapes; the raw line break
    # that used to sit inside the first literal made this statement unparsable.
    sentence_split_l = ['\n', '\t', '。', ';', '?', '.', ';', '?', '...', '、、、', ',', ',']
    for i in sentence_split_l:
        ll = paragraph.split(i)
        sentence_l.append(ll[0])
        if len(ll) > 1:
            paragraph_to_sentence(ll[1], sentence_l)
        else:
            break
    
    
    def sentence_l_to_sentence_l_l(sentence_l):
        """Split every entry of ``sentence_l`` further on the delimiter list.

        Each entry is cut at every delimiter below; the resulting fragments are
        returned as one flat list in input order. Delimiters are not kept and
        empty fragments are skipped.

        The previous implementation appended an unsplit entry once per
        non-matching delimiter and never applied the remaining delimiters to
        fragments it had already split, which produced duplicates.

        Args:
            sentence_l: Iterable of strings to refine. It is not modified.

        Returns:
            list[str]: A new flat list of fragments.
        """
        sentence_split_l = ['\n', '\t', '。', ';', '?', '.', ';', '?', '...', '、、、', ',', ',']
        sentence_l_l = []
        for sentence in sentence_l:
            for token in sentence_split_l:
                # Collapse every delimiter onto one sentinel so a single split suffices.
                sentence = sentence.replace(token, '\n')
            sentence_l_l.extend(piece for piece in sentence.split('\n') if piece)
        return sentence_l_l
    
    
    import requests, time, threading
    
    # Image directory (Windows path, trailing separator included). Backslashes
    # are doubled: written singly, '\U' is an invalid escape and the final '\'
    # swallows the closing quote. The second assignment is the one in effect.
    img_dir = 'C:\\Users\\sas\\PycharmProjects\\py_win_to_unix\\crontab_chk_url\\personas\\trunk\\plugins\\spider\\dl_img_tmp\\'
    img_dir = 'C:\\Users\\sas\\PycharmProjects\\produce_video\\mypng\\'
    import random
    
    import os, time, glob
    
    # Locate this script and derive the two report file names from its name.
    os_sep = os.sep
    this_file_abspath = os.path.abspath(__file__)
    this_file_dirname = os.path.dirname(this_file_abspath)
    this_file_name = os.path.basename(this_file_abspath)
    # Multi-line report: '<script name>.txt'.
    fw_f = '{}{}'.format(this_file_name, '.txt')
    # One-row-per-record report. It previously stripped '.txt' and re-added it,
    # which gave the same path as fw_f, so both write handles truncated and
    # overwrote one file. A distinct suffix keeps the two outputs apart.
    fw_f_onerow = '{}{}'.format(this_file_name, '.onerow.txt')
    
    
    
    
    # Build both reports: for every loaded record, extract the text of its HTML,
    # cut it into sentences, and write the first 31 sentences plus a contact line.
    with open(fw_f, 'w', encoding='utf-8') as fw_txt:
        with open(fw_f_onerow, 'w', encoding='utf-8') as fw_txt_onerow:
            for uid in uid_d:
                str_ = uid_d[uid]['html']
                # Dump the markup to a uniquely named scratch file, then parse it
                # back from disk. Backslashes in the directory are doubled so the
                # literal is valid and keeps its trailing separator.
                fhtml = 'qqzong.vedio.allinone.tmp.html'
                fhtml = '{}{}{}{}'.format('D:\\myv\\myhtml\\', int(time.time()), random.randint(1234, 6789), fhtml)
                with open(fhtml, 'w', encoding='utf-8') as fw:
                    fw.write(str_)
                with open(fhtml, 'r', encoding='utf-8') as fo:
                    soup = BeautifulSoup(fo, 'html.parser')
                    sentence_l = Cut(list(cutlist), list(soup.text))

                # Filter: skip records with fewer than 32 sentences or with any
                # sentence longer than 64 characters.
                sen_num = 32
                if len(sentence_l) < sen_num:
                    continue
                if any(len(sen) > 64 for sen in sentence_l):
                    continue

                s = '{}{}{}'.format('-----------------------', uid, '----------------------------------------\n')
                fw_txt.write(s)
                # File objects are not callable; the one-row report must be
                # written through .write() as well.
                fw_txt_onerow.write(s)
                for sen in sentence_l[:31]:
                    s = '{}{}'.format(sen, '\n')
                    print(s)
                    fw_txt.write(s)
                fw_txt_onerow.write(''.join(sentence_l[0:31]))
                # Contact line with a randomly generated 11-digit number.
                s = '{}{}{}'.format('联系方式:王经理', random.randint(13200000000, 15812341234), '\n')
                fw_txt.write(s)
                fw_txt_onerow.write(s)

    dd = 9
    

      

  • 相关阅读:
    java学习阶段一 方法和文档注释
    java学习阶段一 二维数组
    java学习阶段一 一维数组
    java学习阶段一 循环结构
    java学习阶段一 选择结构
    java学习阶段一 运算符
    oracle学习笔记:修改表空间文件位置
    oracle学习笔记:重建临时表空间
    oracle等待事件1:Failed Logon delay等待事件
    oracle数据库删除归档日志
  • 原文地址:https://www.cnblogs.com/rsapaper/p/8865182.html
Copyright © 2020-2023  润新知