import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.SASA import ShrakeRupley
import Bio.PDB

# Per-residue hydropathy ("kdh") values keyed by lower-case residue name.
# The numbers match the Kyte-Doolittle scale; "mse" (selenomethionine)
# reuses the value of "met".
_KDH_BY_RESIDUE = {
    "asp": -3.5,
    "glu": -3.5,
    "asn": -3.5,
    "gln": -3.5,
    "lys": -3.9,
    "arg": -4.5,
    "his": -3.2,
    "gly": -0.4,
    "pro": -1.6,
    "ser": -0.8,
    "thr": -0.7,
    "cys": 2.5,
    "met": 1.9,
    "mse": 1.9,
    "ala": 1.8,
    "val": 4.2,
    "ile": 4.5,
    "leu": 3.8,
    "phe": 2.8,
    "trp": -0.9,
    "tyr": -1.3,
}

# Column order of the returned DataFrame; each row list follows this order.
_COLUMN_NAMES = [
    "type",
    "atomnumber",
    "atom",
    "resn",
    "chain",
    "resi",
    "coords",
    "occupancy",
    "bfactor",
    "sasa",
    "kdh",
    "phi",
    "psi",
]

_NAN = float("nan")


def _to_float(text):
    """Convert a fixed-width PDB field to float; a blank field becomes NaN."""
    text = text.strip()
    return float(text) if text else _NAN


def _angle_or_nan(angle):
    """Return *angle* as a float, or NaN when the dihedral is undefined."""
    return _NAN if angle is None else float(angle)


def _phi_psi_by_residue(model):
    """Map (chain id, residue number, insertion code) to a (phi, psi) tuple."""
    angles = {}
    for chain in model:
        # NOTE(review): every residue of the chain is handed to Polypeptide
        # as one contiguous peptide, exactly as the original code did, so
        # chain breaks are presumably not detected - confirm if that matters.
        poly = Bio.PDB.Polypeptide.Polypeptide(chain)
        for residue, phi_psi in zip(poly, poly.get_phi_psi_list()):
            _, resseq, icode = residue.id
            angles[(chain.id, resseq, icode)] = phi_psi
    return angles


def _atom_sasa(model, chain_id, resi, icode, atom_name):
    """Return the SASA stored on one atom of *model*, or NaN if not found.

    The residue is addressed with a blank hetero flag, so only residues
    stored under a standard (non-hetero) id can match.
    """
    try:
        return model[chain_id][(" ", resi, icode)][atom_name].sasa
    except (KeyError, AttributeError):
        return _NAN


def parsePDB(path):
    """Parse the ATOM/HETATM records of a PDB file into a DataFrame.

    The file is parsed once with Biopython to compute the solvent accessible
    surface area (Shrake-Rupley, level "S") and the backbone phi/psi angles,
    both taken from the first model of the structure.  It is then read line
    by line and every ATOM/HETATM record becomes one row.

    Args:
        path: Full path to the PDB file.

    Returns:
        A pandas DataFrame with the columns "type", "atomnumber", "atom",
        "resn", "chain", "resi", "coords" (a list ``[x, y, z]``),
        "occupancy", "bfactor", "sasa", "kdh", "phi" and "psi".  Values
        that cannot be determined are float NaN.

    Raises:
        ValueError: If a serial number, residue number or coordinate field
            of an ATOM/HETATM record is not numeric.
    """
    # Solvent accessible surface area, see
    # https://biopython.org/docs/dev/api/Bio.PDB.SASA.html
    struct = PDBParser(QUIET=1).get_structure("pdb", path)
    ShrakeRupley().compute(struct, level="S")

    # SASA and dihedrals are both read from the same (first) model.
    model = struct[0]
    phi_psi_lookup = _phi_psi_by_residue(model)

    rows = []
    with open(path) as handle:
        for line in handle:
            if not line.startswith(("ATOM", "HETATM")):
                continue

            atom_name = line[12:16].strip()
            resn = line[17:20].strip()
            # Keep the raw chain column for lookups: Biopython stores a
            # blank chain id as " ", which .strip() would turn into "".
            chain_id = line[21]
            resi = int(line[22:26])
            icode = line[26]

            # Dihedrals are a residue property, repeated on each atom row.
            phi, psi = phi_psi_lookup.get((chain_id, resi, icode), (None, None))

            rows.append(
                [
                    line[0:6].strip(),  # type
                    int(line[6:11]),  # atomnumber
                    atom_name,  # atom
                    resn,  # resn
                    chain_id.strip(),  # chain
                    resi,  # resi
                    [
                        float(line[30:38]),
                        float(line[38:46]),
                        float(line[46:54]),
                    ],  # coords
                    _to_float(line[54:60]),  # occupancy
                    _to_float(line[60:66]),  # bfactor
                    _atom_sasa(model, chain_id, resi, icode, atom_name),  # sasa
                    _KDH_BY_RESIDUE.get(resn.lower(), _NAN),  # kdh
                    _angle_or_nan(phi),  # phi
                    _angle_or_nan(psi),  # psi
                ]
            )

    return pd.DataFrame(rows, columns=_COLUMN_NAMES)


# NOTE: these statements run whenever the module is executed or imported,
# as in the original script; wrap them in `if __name__ == "__main__":`
# if this file is ever imported as a library.
x = parsePDB("1trn.pdb")
x.to_csv("result.csv")
print(x)