下面是我自己最近所学习的知识。
之前自己处理的数据集中发现了比较大的问题,所以现在重新处理一遍。
关于从pdb数据库中提取一级序列
定义一个PDBparser类,并在类中定义parsePrimaryStructure()方法;调用pdbparser.parsePrimaryStructure(file)即可从PDB文件中提取一级序列。
import json


class PDBparser:
    """Parser that extracts structure information from a PDB file.

    NOTE(review): the private helpers ``__loadPDBfile``, ``__parseLine`` and
    ``__parsePriLine`` called below are not defined in this excerpt; they are
    assumed to be defined elsewhere in this class -- confirm.
    """

    def parse(self, PDBfile, indent=False):
        '''
        Parse a PDB file into a nested dictionary.

        The result always has the keys ``'primary_structure'`` (a list) and
        ``'tertiary_structure'`` (a dict holding a ``'chains'`` list); they
        start out empty and are handed to ``__parseLine`` together with the
        loaded lines.

        @param PDBfile: The full path of the PDB file, str
        @param indent: When true, additionally return the value produced by
                       ``__parseLine`` (intended for testing)
        @return: The full information parsed from the PDB file, or the tuple
                 ``(content, langthinformation)`` when ``indent`` is true
        '''
        # Skeleton of the result; presumably filled in place by __parseLine.
        content = {
            'primary_structure': [],
            'tertiary_structure': {'chains': []},
        }
        lines = self.__loadPDBfile(PDBfile)
        langthinformation = self.__parseLine(lines, content)
        if indent:
            return content, langthinformation  # for test
        return content

    def parsePrimaryStructure(self, PDBfile):
        '''
        Extract the primary structure from the SEQRES records of a PDB file.

        Every line whose first whitespace-separated token is ``SEQRES`` is
        passed to ``__parsePriLine`` together with the result list. Blank or
        whitespace-only lines are skipped.

        @param PDBfile: The full path of the PDB file, str
        @return: The primary structure parsed from the PDB file, list
        '''
        priStructure = []
        for line in self.__loadPDBfile(PDBfile):
            fields = line.split()
            # An empty/whitespace-only line splits to []; indexing it would
            # raise IndexError, so guard before inspecting the record name.
            if fields and fields[0] == 'SEQRES':
                self.__parsePriLine(line, priStructure)
        return priStructure
定义程序入口(if __name__ == "__main__"),从键盘读入PDB文件路径并输出提取到的一级序列:
if __name__ == "__main__":
    # Script entry point: ask for a PDB file path, extract its primary
    # structure and print it as indented JSON.
    # Raw string: the Windows example path contains backslashes, and "\p" is
    # an invalid escape sequence in a normal literal (warning on modern
    # Python). The printed text is unchanged.
    print(r'please input the path of the PDBfile:such as E:\pdb\pdb3rum.ent')
    pdb_path = input()  # path of the PDB file to parse
    pdbparser = PDBparser()
    content = pdbparser.parsePrimaryStructure(pdb_path)
    formatinput = json.dumps(content, indent=1)
    print(formatinput)
    print("Done")