• PSSM特征-从生成到处理


    以下代码均为个人原创,如有疑问,欢迎交流。新浪微博:拾毅者

    本节内容:

    • pssm生成
    • pssm简化
    • 标准的pssm构建
    • 滑动pssm生成

    在基于蛋白质序列的相关预測中。使用PSSM打分矩阵会得将预測效果大大提高,同一时候,假设使用滑动的PSSM,效果又会进一步提高。这里主要以分享代码为主,以下介绍下PSSM从生成到处理的全过程。

    1.PSSM的生成

    PSSM的生成有多种方式,这里使用的psiblast软件。ncbi-blast-2.2.28+/bin/psiblast。下载地址:http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastNews#1 用法。输入一个序列,加上相关參数,直接输出PSSM文件

    代码

    #一个命令函数,依据pdb文件。使用blast生成pssm文件
    def command_pssm(content, output_file,pssm_file):
        os.system('/ifs/share/lib/blast/ncbi-blast-2.2.28+/bin/psiblast 
                    -query %s 
                    -db /ifs/data/database/blast_data/nr 
                    -num_iterations 3 
                    -out %s 
                    -out_ascii_pssm %s &' %(content, output_file,pssm_file))

    上面是运行的命令,封装成函数,以下为实际代码:

    #对每一个序列进行PSSM打分
    def pssm(proseq,outdir):
        inputfile = open(proseq,'r')
        content = ''
        input_file = ''
        output_file = ''
        pssm_file = ''
        chain_name = []
        for eachline in inputfile:
            if '>' in eachline:
                if len(content):
                    temp_file = open(outdir + '/fasta/' + chain_name,'w')
                    temp_file.write(content)
                    input_file = outdir + '/fasta/' + chain_name
                    output_file = outdir + '/' + chain_name + '.out'
                    pssm_file = outdir + '/' + chain_name + '.pssm'                
                    command_pssm(input_file, output_file,pssm_file)
                    temp_file.close
                content = ''
                chain_name = eachline[1:5] + eachline[6:7]
            content +=  ''.join(eachline)
            #print content
            #print chain_name
        if len(content):
            temp_file = open(outdir + '/fasta/' + chain_name,'w')
            temp_file.write(content)
            input_file = outdir + '/fasta/' + chain_name
            output_file = outdir + '/' + chain_name + '.out'
            pssm_file = outdir + '/' + chain_name + '.pssm'
            command_pssm(input_file, output_file,pssm_file)  
            temp_file.close
        inputfile.close()       

    測试用例:

    '''            
    #生成pssm文件,迭代次数为3       
    proseq = '/ifs/home/liudiwei/experiment/step2/data/protein.seq'
    outdir = '/ifs/home/liudiwei/experiment/step2/pssm'
    pssm(proseq,outdir)
    '''

    PSSM输出例子:

    2.简化PSSM数据

    通常我们须要的仅仅是前面的20列

    以下通过代码来实现上面的功能:

    #格式化pssm每行数据
    def formateachline(eachline):
        col = eachline[0:5].strip()
        col += '	' + eachline[5:8].strip()    
        begin = 9 
        end = begin +3
        for i in range(20):     
            begin = begin
            end = begin + 3
            col += '	' + eachline[begin:end].strip()
            begin = end
        col += '
    '
        return col

    简化pssm。仅仅要得到前面的20个氨基酸的打分值

    def simplifypssm(pssmdir,newdir):
        listfile = os.listdir(pssmdir)
        for eachfile in listfile:
            with open(pssmdir + '/' + eachfile,'r') as inputpssm:
                with open(newdir + '/' + eachfile,'w') as outfile:
                    count = 0
                    for eachline in inputpssm:
                        count +=1
                        if count <= 3:
                            continue
                        if not len(eachline.strip()):
                            break
                        oneline = formateachline(eachline)
                        outfile.write(''.join(oneline))
    ''' Test example
    pssmdir = '/ifs/home/liudiwei/experiment/step2/pssm/oldpssm'
    newdir = '/ifs/home/liudiwei/experiment/step2/pssm/newpssm'
    simplifypssm(pssmdir, newdir)
    '''

    3.得到标准的PSSM

    通过上面抽取出来的PSSM,以下通过代码来获得一个滑动的PSSM

    #标准的pssm,直接依据标准的pssm滑动
    def standardPSSM(window_size,pssmdir,outdir):
        listfile = os.listdir(pssmdir)
        for eachfile in listfile:
            outfile = open(outdir + '/' + eachfile, 'w') 
            with open(pssmdir + '/' + eachfile, 'r') as inputf:
                inputfile = inputf.readlines()
                for linenum in range(len(inputfile)):
                    content = []
                    first = [];second = [];third=[];last=[]
                    if linenum < window_size/2:
                        for i in range((window_size/2 - linenum)*20):
                            second.append('	0')
                    if window_size/2 - linenum > 0:
                        countline = window_size - (window_size/2 - linenum)
                    else:
                        countline = window_size  #get needed line count
    
                    linetemp = 0
                    for eachline in inputfile:
                        if linetemp < linenum-window_size/2:
                            linetemp += 1
                            continue
                        if linetemp == linenum:
                            thisline = eachline.split('	')
                            for j in range(0,2):
                                if j>0:
                                    first.append('	')
                                first.append(thisline[j].strip())
                        if countline > 0:
                            oneline = eachline.split('	')
                            for j in range(2,len(oneline)):
                                third.append('	' + oneline[j].strip())
                            countline -=1
                        else:
                            break
                        linetemp += 1
                    while countline:
                        for i in range(20):
                            last.append('	0')
                        countline -=1
                    content += first + second + third + last
                    outfile.write(''.join(content) + '
    ')
            outfile.close()
    
    '''Test example
    pssmdir = '/ifs/home/liudiwei/experiment/step2/pssm/newpssm'
    newdir = '/ifs/home/liudiwei/experiment/step2/pssm/standardpssm'
    window_size = 5
    standardPSSM(window_size,pssmdir, newdir)
    '''

    4.依据滑动窗体求出滑动的PSSM

    #依据窗体大小,计算出滑动后的20个氨基酸打分值
    def computedPSSM(window_size,pssmdir,outdir):
        listfile = os.listdir(pssmdir)
        for eachfile in listfile:
            outfile = open(outdir + '/' + eachfile, 'w') 
            with open(pssmdir + '/' + eachfile, 'r') as inputf:
                inputfile = inputf.readlines()
                for linenum in range(len(inputfile)):
                    content = []
                    first = [];second = []
                    if window_size/2 - linenum > 0:
                        countline = window_size - (window_size/2 - linenum)
                    else:
                        countline = window_size  #get needed line count
    
                    linetemp = 0
                    for eachline in inputfile:
                        if linetemp < linenum-window_size/2:
                            linetemp += 1
                            continue
                        if linetemp == linenum:
                            thisline = eachline.split('	')
                            for j in range(0,2):
                                if j>0:first.append('	')
                                first.append(thisline[j].strip())
                        if countline > 0: 
                            oneline = eachline.split('	')[2:len(eachline)]
                            tline = []
                            for i in range(len(oneline)):
                                tline.append(int(oneline[i]))
                            if len(second)==0:
                                second += tline
                            else:
                                second = list(map(lambda x: x[0]+x[1], zip(second, tline)))
                            countline -=1 
                        else:
                            break 
                        linetemp += 1 
                    format_second = []
                    for i in range(len(second)):
                        format_second.append('	' + str(second[i]))
                    content += first + format_second 
                    outfile.write(''.join(content) + '
    ')
            outfile.close()
    '''      
    pssmdir = '/ifs/home/liudiwei/experiment/step2/pssm/newpssm'
    newdir = '/ifs/home/liudiwei/experiment/step2/pssm/computedpssm'
    window_size = 5
    computedPSSM(window_size,pssmdir, newdir)
    '''

    平滑的PSSM,仅仅是pssmdir不同,直接调用standardPSSM函数

    def smoothedPSSM(window_size,pssmdir,outdir):
        standardPSSM(window_size,pssmdir, outdir)
    
    '''Test example
    pssmdir = '/ifs/home/liudiwei/experiment/step2/pssm/computedpssm'
    newdir = '/ifs/home/liudiwei/experiment/step2/pssm/smoothedpssm'
    window_size = 5
    smoothedPSSM(window_size,pssmdir,newdir)
    '''

    最后得到的是一个滑动的PSSM矩阵,特征的维数随窗体的大小逐渐增减。



    本栏目python代码分享持续更新中,欢迎关注:Dream_Angel_Z博客 新浪微博:拾毅者


  • 相关阅读:
    2016年3月3日
    性能测试之我解
    Vim命令合集
    vi-vim常用命令
    架构的本质是
    网站三层架构学习之一 分层式结构
    Spring 4 + Quartz 2.2.1 Scheduler Integration Example
    “城市民族工作条例”详解--建议废止
    字符串匹配处理
    LogBack简易教程
  • 原文地址:https://www.cnblogs.com/jhcelue/p/7248465.html
Copyright © 2020-2023  润新知