Module library.parser
Source code
import os
from library.classes.dataset import Dataset
from Bio.PDB.PDBIO import Select
from Bio.PDB.PDBIO import PDBIO
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import Structure, Residue, Atom, Model, Chain
import tensorflow as tf
from library.static.topologies import DOPC_CG_NAME_TO_TYPE_MAP, DOPC_BEAD_TYPE_NAME_IDS, DOPC_ELEMENT_TYPE_NAME_IDS
import numpy as np
import time
from datetime import datetime
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def find_all_pdb_files(path):
"""
Find all .pdb files recursively under a given path and return a list of Dataset objects
"""
pdb_files = []
with os.scandir(path) as entries:
for entry in entries:
if entry.is_file() and entry.name.endswith('.pdb'):
pdb_files.append(Dataset(entry.name, entry.path, 'pdb'))
elif entry.is_dir():
pdb_files.extend(find_all_pdb_files(entry.path))
return pdb_files
def get_pdb_file_paths_dic(path):
"""
Returns a dictionary of pdb datasets keyed by the name of the folder that contains them,
E.g. {'CG2AT_2023-02-13_20-20-52': [<Dataset object at 0xa>, ...]}
"""
pdb_files_dic = {}
with os.scandir(path) as data_folders:
for data_folder in [d for d in data_folders if not d.is_file()]:
# Find all pdb files in the data folder
datasets = find_all_pdb_files(data_folder.path)
# Add parent to datasets so we know where they came from
for dataset in datasets:
dataset.parent = data_folder.name
# Add the datasets to the dictionary
pdb_files_dic[data_folder.name] = datasets
return pdb_files_dic
def get_cg_at_datasets(
path,
CG_PATTERN='CG_INPUT.pdb',
AT_PATTERN='final_cg2at_de_novo.pdb'
):
"""
Get all CG and AT datasets from a given path.
This assumes the folder structure provided by Chetan, e.g.:
data/raw/CG2AT_2023-02-13_20-20-52/
/FINAL/final_cg2at_de_novo.pdb
/INPUT/CG_INPUT.pdb
/INPUT/DOPC_Frame_....pdb
/MERGED/merged_cg2at_de_novo.pdb
...
Parameters:
path (str): The path to the data folder
CG_PATTERN (str): The filename pattern that identifies CG pdb files
AT_PATTERN (str): The filename pattern that identifies AT pdb files
Returns:
cg_datasets (list): A list of CG datasets
at_datasets (list): A list of AT datasets
"""
all_pdb_files_dic = get_pdb_file_paths_dic(path)
cg_datasets = []
at_datasets = []
for key in all_pdb_files_dic.keys():
for dataset in all_pdb_files_dic[key]:
if dataset.path.endswith(CG_PATTERN):
cg_datasets.append(dataset)
elif dataset.path.endswith(AT_PATTERN):
at_datasets.append(dataset)
return cg_datasets, at_datasets
def get_structure_from_dataset(dataset):
"""
Returns a Bio.PDB.Structure object from a given dataset
"""
parser = PDBParser()
return parser.get_structure(dataset.name, dataset.path)
class ResidueSelector(Select):
def __init__(self, target_id):
self.target_id = target_id
def accept_residue(self, residue):
# Accept residues whose sequence number matches the target (in any model/chain); TODO: improve this
return residue.get_id()[1] == self.target_id
def generate_training_data(path_to_raw_data, output_dir_path):
# Get all CG and AT datasets (this is only indexing the data, not loading it)
cg_datasets, at_datasets = get_cg_at_datasets(path_to_raw_data)
io = PDBIO()
idx = 0
# Loop over CG and AT datasets in parallel (the datasets only index files on disk; structures are loaded on access)
for i, (cg_dataset, at_dataset) in enumerate(zip(cg_datasets, at_datasets)):
for j, (cg_residue, at_residue) in enumerate(zip(cg_dataset.get_residues(), at_dataset.get_residues())):
# Create folder for the idx
if not os.path.exists(f"{output_dir_path}/{idx}"):
os.makedirs(f"{output_dir_path}/{idx}")
io.set_structure(cg_dataset.get_structure())
io.save(f"{output_dir_path}/{idx}/cg.pdb",
ResidueSelector(cg_residue.get_id()[1]), preserve_atom_numbering=True)
io.set_structure(at_dataset.get_structure())
io.save(f"{output_dir_path}/{idx}/at.pdb",
ResidueSelector(at_residue.get_id()[1]), preserve_atom_numbering=True)
idx += 1
if idx % 10 == 0:
timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
print(f"[{timestamp}] Generated {idx} training examples")
def pdb_data_to_xyz(batch_size, idx, input_dir_path, input_size, output_size):
"""
Loads a batch of paired CG/AT training examples (cg.pdb / at.pdb) and converts them into normalized coordinate + one-hot type tensors
"""
# Initialize Batch
X = np.empty((batch_size, *input_size))
Y = np.empty((batch_size, *output_size))
# Load the two files in the idx folder
for i in range(batch_size):
# Get the index of the residue
residue_idx = idx * batch_size + i
# Get the path to the files
cg_path = f"{input_dir_path}/{residue_idx}/cg.pdb"
at_path = f"{input_dir_path}/{residue_idx}/at.pdb"
# Load the files
parser = PDBParser(QUIET=True)
cg_structure = parser.get_structure(residue_idx, cg_path)
at_structure = parser.get_structure(residue_idx, at_path)
print(cg_structure, at_structure)
# Get the residues
cg_residue = list(cg_structure.get_residues())[0]
at_residue = list(at_structure.get_residues())[0]
# Get the atoms
cg_atoms = list(cg_residue.get_atoms())
at_atoms = list(at_residue.get_atoms())
# Make a 200x200x200 box for coordinates
X_MAX = 200
Y_MAX = 200
Z_MAX = 200
# Make the cg data (batchsize, 12, 8)
for j, bead in enumerate(cg_atoms):
# Get coordinates
x, y, z = bead.get_coord()
# Make the coordinates relative to the box
X[i, j, 0] = x / X_MAX
X[i, j, 1] = y / Y_MAX
X[i, j, 2] = z / Z_MAX
# Make one hot encoding for the bead type
X[i, j, 3:8] = 0
bead_type_id = DOPC_BEAD_TYPE_NAME_IDS[DOPC_CG_NAME_TO_TYPE_MAP[bead.get_name()]]
X[i, j, 3 + bead_type_id] = 1
# Make the at data (batchsize, 138, 8)
for j, atom in enumerate(at_atoms):
# Get coordinates
x, y, z = atom.get_coord()
# Make the coordinates relative to the box
Y[i, j, 0] = x / X_MAX
Y[i, j, 1] = y / Y_MAX
Y[i, j, 2] = z / Z_MAX
# Make one hot encoding for the element type
Y[i, j, 3:8] = 0
at_type_id = DOPC_ELEMENT_TYPE_NAME_IDS[atom.element]
Y[i, j, 3 + at_type_id] = 1
# Convert to tensor
X = tf.convert_to_tensor(X, dtype=tf.float32)
Y = tf.convert_to_tensor(Y, dtype=tf.float32)
return X, Y
def cg_xyz_to_pdb_data(X, output_dir_path):
"""
Writes a batch of coarse-grained (CG) coordinate/one-hot arrays back to pdb files, one file per batch entry
"""
batch_size = X.shape[0]
# Make the output directory
if not os.path.exists(output_dir_path):
os.makedirs(output_dir_path)
# Loop over the batch
for i in range(batch_size):
# Make a 200x200x200 box for coordinates
X_MAX = 200
Y_MAX = 200
Z_MAX = 200
# Get the relative coordinates
coords = X[i, :, :3]
# Get bead_type index
bead_type = X[i, :, 3:]
bead_type_idx = np.argmax(bead_type, axis=1)  # currently unused; bead names come from the fixed ordering below
# Make the structure
residue_id = (' ', i, ' ')
residue_name = 'DOP'
residue = Residue.Residue(residue_id, residue_name, segid='')
# Add the atoms
for j, coord in enumerate(coords):
# Recover the original coordinates
x = float(coord[0] * X_MAX)
y = float(coord[1] * Y_MAX)
z = float(coord[2] * Z_MAX)
# Convert to number with 5 total decimals including the sign and decimals before the point
# x = "{:2.3f}".format(x)
# y = "{:2.3f}".format(y)
# z = "{:2.3f}".format(z)
# Bead name at position j in the fixed DOPC CG bead ordering
bead_name = [
"NC3",
"PO4",
"GL1",
"GL2",
"C1A",
"D2A",
"C3A",
"C4A",
"C1B",
"D2B",
"C3B",
"C4B",
][j]
name_to_ele = {
"NC3": "N",
"PO4": "P",
"GL1": "X",
"GL2": "X",
"C1A": "C",
"D2A": "D",
"C3A": "C",
"C4A": "C",
"C1B": "C",
"D2B": "D",
"C3B": "C",
"C4B": "C",
}
element = name_to_ele[bead_name]
# Add the atom
atom = Atom.Atom(
name=bead_name,
coord=(x, y, z),
bfactor=0.0,
occupancy=1.0,
altloc=" ",
fullname=f" {bead_name} ",
serial_number=j,
element=element
)
residue.add(atom)
io = PDBIO()
io.set_structure(residue)
io.save(f"{output_dir_path}/cg_{i}.pdb", preserve_atom_numbering=True)
def at_xyz_to_pdb_data(X, output_dir_path):
"""
Writes a batch of atomistic (AT) coordinate/one-hot arrays back to pdb files, one file per batch entry
"""
batch_size = X.shape[0]
# Make the output directory
if not os.path.exists(output_dir_path):
os.makedirs(output_dir_path)
# Loop over the batch
for i in range(batch_size):
# Make a 200x200x200 box for coordinates
X_MAX = 200
Y_MAX = 200
Z_MAX = 200
# Get the relative coordinates
coords = X[i, :, :3]
# Get bead_type index
bead_type = X[i, :, 3:]
bead_type_idx = np.argmax(bead_type, axis=1)  # currently unused; atom names come from the fixed ordering below
# Make the structure
residue_id = (' ', i, ' ')
residue_name = 'DOP'
residue = Residue.Residue(residue_id, residue_name, segid='')
# Add the atoms
for j, coord in enumerate(coords):
# Recover the original coordinates
x = float(coord[0] * X_MAX)
y = float(coord[1] * Y_MAX)
z = float(coord[2] * Z_MAX)
# Convert to number with 5 total decimals including the sign and decimals before the point
# x = "{:2.3f}".format(x)
# y = "{:2.3f}".format(y)
# z = "{:2.3f}".format(z)
# Atom name at position j in the fixed DOPC atom ordering
bead_name = [
" N ",
" C12",
"H12A",
"H12B",
" C13",
"H13A",
"H13B",
"H13C",
" C14",
"H14A",
"H14B",
"H14C",
" C15",
"H15A",
"H15B",
"H15C",
" C11",
"H11A",
"H11B",
" P ",
" O13",
" O14",
" O12",
" O11",
" C1 ",
" HA ",
" HB ",
" C2 ",
" HS ",
" O21",
" C21",
" O22",
" C22",
" H2R",
" H2S",
" C3 ",
" HX ",
" HY ",
" O31",
" C31",
" O32",
" C32",
" H2X",
" H2Y",
" C23",
" H3R",
" H3S",
" C24",
" H4R",
" H4S",
" C25",
" H5R",
" H5S",
" C26",
" H6R",
" H6S",
" C27",
" H7R",
" H7S",
" C28",
" H8R",
" H8S",
" C29",
" H9R",
"C210",
"H10R",
"C211",
"H11R",
"H11S",
"C212",
"H12R",
"H12S",
"C213",
"H13R",
"H13S",
"C214",
"H14R",
"H14S",
"C215",
"H15R",
"H15S",
"C216",
"H16R",
"H16S",
"C217",
"H17R",
"H17S",
"C218",
"H18R",
"H18S",
"H18T",
" C33",
" H3X",
" H3Y",
" C34",
" H4X",
" H4Y",
" C35",
" H5X",
" H5Y",
" C36",
" H6X",
" H6Y",
" C37",
" H7X",
" H7Y",
" C38",
" H8X",
" H8Y",
" C39",
" H9X",
"C310",
"H10X",
"C311",
"H11X",
"H11Y",
"C312",
"H12X",
"H12Y",
"C313",
"H13X",
"H13Y",
"C314",
"H14X",
"H14Y",
"C315",
"H15X",
"H15Y",
"C316",
"H16X",
"H16Y",
"C317",
"H17X",
"H17Y",
"C318",
"H18X",
"H18Y",
"H18Z",
][j]
name_to_ele = {
" N ": "N",
" C12": "C",
"H12A": "H",
"H12B": "H",
" C13": "C",
"H13A": "H",
"H13B": "H",
"H13C": "H",
" C14": "C",
"H14A": "H",
"H14B": "H",
"H14C": "H",
" C15": "C",
"H15A": "H",
"H15B": "H",
"H15C": "H",
" C11": "C",
"H11A": "H",
"H11B": "H",
" P ": "P",
" O13": "O",
" O14": "O",
" O12": "O",
" O11": "O",
" C1 ": "C",
" HA ": "H",
" HB ": "H",
" C2 ": "C",
" HS ": "H",
" O21": "O",
" C21": "C",
" O22": "O",
" C22": "C",
" H2R": "H",
" H2S": "H",
" C3 ": "C",
" HX ": "H",
" HY ": "H",
" O31": "O",
" C31": "C",
" O32": "O",
" C32": "C",
" H2X": "H",
" H2Y": "H",
" C23": "C",
" H3R": "H",
" H3S": "H",
" C24": "C",
" H4R": "H",
" H4S": "H",
" C25": "C",
" H5R": "H",
" H5S": "H",
" C26": "C",
" H6R": "H",
" H6S": "H",
" C27": "C",
" H7R": "H",
" H7S": "H",
" C28": "C",
" H8R": "H",
" H8S": "H",
" C29": "C",
" H9R": "H",
"C210": "C",
"H10R": "H",
"C211": "C",
"H11R": "H",
"H11S": "H",
"C212": "C",
"H12R": "H",
"H12S": "H",
"C213": "C",
"H13R": "H",
"H13S": "H",
"C214": "C",
"H14R": "H",
"H14S": "H",
"C215": "C",
"H15R": "H",
"H15S": "H",
"C216": "C",
"H16R": "H",
"H16S": "H",
"C217": "C",
"H17R": "H",
"H17S": "H",
"C218": "C",
"H18R": "H",
"H18S": "H",
"H18T": "H",
" C33": "C",
" H3X": "H",
" H3Y": "H",
" C34": "C",
" H4X": "H",
" H4Y": "H",
" C35": "C",
" H5X": "H",
" H5Y": "H",
" C36": "C",
" H6X": "H",
" H6Y": "H",
" C37": "C",
" H7X": "H",
" H7Y": "H",
" C38": "C",
" H8X": "H",
" H8Y": "H",
" C39": "C",
" H9X": "H",
"C310": "C",
"H10X": "H",
"C311": "C",
"H11X": "H",
"H11Y": "H",
"C312": "C",
"H12X": "H",
"H12Y": "H",
"C313": "C",
"H13X": "H",
"H13Y": "H",
"C314": "C",
"H14X": "H",
"H14Y": "H",
"C315": "C",
"H15X": "H",
"H15Y": "H",
"C316": "C",
"H16X": "H",
"H16Y": "H",
"C317": "C",
"H17X": "H",
"H17Y": "H",
"C318": "C",
"H18X": "H",
"H18Y": "H",
"H18Z": "H",
}
element = name_to_ele[bead_name]
# Add the atom
atom = Atom.Atom(
name=bead_name,
coord=(x, y, z),
bfactor=0.0,
occupancy=1.0,
altloc=" ",
fullname=f" {bead_name} ",
serial_number=j,
element=element
)
residue.add(atom)
io = PDBIO()
io.set_structure(residue)
io.save(f"{output_dir_path}/at_{i}.pdb", preserve_atom_numbering=True)
Functions
def at_xyz_to_pdb_data(X, output_dir_path)
-
Writes a batch of atomistic (AT) coordinate/one-hot arrays back to pdb files, one file per batch entry
def cg_xyz_to_pdb_data(X, output_dir_path)
-
Writes a batch of coarse-grained (CG) coordinate/one-hot arrays back to pdb files, one file per batch entry
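A minimal usage sketch with placeholder paths; the input must have shape (batch, 12, 8) to match the DOPC CG layout:
from library.parser import pdb_data_to_xyz, cg_xyz_to_pdb_data

X, _ = pdb_data_to_xyz(batch_size=4, idx=0, input_dir_path="data/training",
                       input_size=(12, 8), output_size=(138, 8))
# Writes cg_0.pdb ... cg_3.pdb into the output folder
cg_xyz_to_pdb_data(X.numpy(), "data/rebuilt_cg")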
def find_all_pdb_files(path)
-
Find all .pdb files recursively under a given path and return a list of Dataset objects
def generate_training_data(path_to_raw_data, output_dir_path)
-
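generate_training_data indexes the paired CG/AT datasets under path_to_raw_data and writes one folder per residue pair, each containing a single-residue cg.pdb and at.pdb. A minimal usage sketch with placeholder paths:
from library.parser import generate_training_data

# Produces data/training/0/cg.pdb, data/training/0/at.pdb, data/training/1/..., etc.
generate_training_data("data/raw", "data/training")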
def get_cg_at_datasets(path, CG_PATTERN='CG_INPUT.pdb', AT_PATTERN='final_cg2at_de_novo.pdb')
-
Get all CG and AT datasets from a given path. This assumes the folder structure provided by Chetan, e.g.:
data/raw/CG2AT_2023-02-13_20-20-52/
/FINAL/final_cg2at_de_novo.pdb
/INPUT/CG_INPUT.pdb
/INPUT/DOPC_Frame_....pdb
/MERGED/merged_cg2at_de_novo.pdb
…
Parameters
path (str): The path to the data folder
CG_PATTERN (str): The filename pattern that identifies CG pdb files
AT_PATTERN (str): The filename pattern that identifies AT pdb files
Returns
cg_datasets (list): A list of CG datasets
at_datasets (list): A list of AT datasets
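A minimal usage sketch; data/raw is a placeholder path:
from library.parser import get_cg_at_datasets

cg_datasets, at_datasets = get_cg_at_datasets("data/raw")
print(f"{len(cg_datasets)} CG / {len(at_datasets)} AT datasets")
for cg, at in zip(cg_datasets, at_datasets):
    print(cg.parent, cg.path, "->", at.path)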
def get_pdb_file_paths_dic(path)
-
Returns a dictionary of pdb datasets keyed by the name of the folder that contains them,
E.g. {'CG2AT_2023-02-13_20-20-52': [<Dataset object at 0xa>, ...]}
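A minimal usage sketch; data/raw is a placeholder path:
from library.parser import get_pdb_file_paths_dic

pdb_dic = get_pdb_file_paths_dic("data/raw")
for folder_name, datasets in pdb_dic.items():
    print(folder_name, [d.name for d in datasets])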
def get_structure_from_dataset(dataset)
-
Returns a Bio.PDB.Structure object from a given dataset
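A minimal usage sketch, reusing get_cg_at_datasets to obtain a dataset; data/raw is a placeholder path:
from library.parser import get_cg_at_datasets, get_structure_from_dataset

cg_datasets, _ = get_cg_at_datasets("data/raw")
structure = get_structure_from_dataset(cg_datasets[0])
print(len(list(structure.get_residues())), "residues")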
def pdb_data_to_xyz(batch_size, idx, input_dir_path, input_size, output_size)
-
Loads a batch of paired CG/AT training examples (cg.pdb / at.pdb) and converts them into normalized coordinate + one-hot type tensors
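A minimal usage sketch; data/training is a placeholder path, and the (12, 8) / (138, 8) shapes match the DOPC layout assumed in the function body (x, y, z plus five one-hot type slots):
from library.parser import pdb_data_to_xyz

X, Y = pdb_data_to_xyz(
    batch_size=64,
    idx=0,                           # batch index: reads folders 0..63
    input_dir_path="data/training",
    input_size=(12, 8),              # 12 CG beads
    output_size=(138, 8),            # 138 atoms
)
print(X.shape, Y.shape)  # (64, 12, 8) (64, 138, 8)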
Classes
class ResidueSelector (target_id)
-
A Bio.PDB Select subclass that accepts only residues whose sequence number equals target_id.
Used by generate_training_data() to write single-residue pdb files.
Ancestors
- Bio.PDB.PDBIO.Select
Methods
def accept_residue(self, residue)
-
Accept a residue only if its sequence number equals target_id.
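A minimal usage sketch showing how the selector restricts PDBIO output to a single residue sequence number; frame.pdb and residue number 42 are placeholders:
from Bio.PDB.PDBIO import PDBIO
from Bio.PDB.PDBParser import PDBParser
from library.parser import ResidueSelector

structure = PDBParser(QUIET=True).get_structure("frame", "frame.pdb")
io = PDBIO()
io.set_structure(structure)
io.save("residue_42.pdb", ResidueSelector(42))  # keeps only residues with sequence number 42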