Datasets

In this analysis, we compare Eisenberg's dataset of interacting APRs (labelled Template structures) with our own dataset of hexapeptide pairs, extracted from known amyloid proteins in the PDB and repaired with FoldX. Eisenberg's dataset contains 181 structures; ours contains tens of thousands.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm
from Bio.PDB import PDBParser, PPBuilder

import math
import os

# RSA #
from Bio.PDB import PDBParser
from Bio.Data.IUPACData import protein_letters_3to1
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB import SASA
from Bio.PDB.SASA import ShrakeRupley
from collections import defaultdict

from concurrent.futures import ProcessPoolExecutor, as_completed
from Bio.PDB import PDBParser, PDBExceptions
#######

# Shape Analysis #
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

from sklearn.decomposition import PCA
import umap

from skbio.stats.ordination import pcoa
from skbio import DistanceMatrix
##################

# Trees #
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
#########


sns.set_theme(style="whitegrid")

Some structures needed to be standardized before being fed into FoldX's Repair command. We load our datasets and remove the non-standardized structures that have a standardized equivalent.

# Get rid of non-standardized files if standardized exists

def filter_non_standardized(df):
    df = df.copy()
    df['base_id'] = df['PDB_File'].apply(get_base_id)

    # Find which base IDs have both versions
    has_standardized = set(
        df[df['PDB_File'].str.endswith('_standardized_Repair.pdb')]['base_id']
    )

    is_standardized = set(
        df[df['PDB_File'].str.endswith('_standardized_Repair.pdb')]['PDB_File']
    )

    # Keep:
    # - all standardized files
    # - non-standardized files only if no standardized version exists
    filtered_df = df[
        (
            ~df['PDB_File'].str.endswith('_Repair.pdb') | 
            (df['PDB_File'].str.endswith('_Repair.pdb') & ~df['base_id'].isin(has_standardized)) |
            df['PDB_File'].isin(is_standardized)
        )
    ]

    # Drop helper column
    filtered_df = filtered_df.drop(columns=['base_id'])
    
    return filtered_df

# Extract base ID (before _Repair.pdb or _standardized_Repair.pdb)
def get_base_id(filename):
    if filename.endswith('_standardized_Repair.pdb'):
        return filename.replace('_standardized_Repair.pdb', '')
    elif filename.endswith('_Repair.pdb'):
        return filename.replace('_Repair.pdb', '')
    else:
        return filename
# Stability datasets
print("Reading stability datasets...")

## Read template (Eisenberg's) dataset
template_df = pd.read_csv("./results/foldx_stability_results_templates.csv")
## Read combined stability results for unrepaired and repaired fragment pairs and split them
combined_stability_df = pd.read_csv("./results/foldx_stability_results_fragment_pairs.csv")
extracted_df = combined_stability_df[~combined_stability_df['PDB_File'].str.endswith('Repair.pdb')]
repaired_df = combined_stability_df[combined_stability_df['PDB_File'].str.endswith('Repair.pdb')]
print("Loaded\n")

## Remove unwanted elements
print(f"Removing unwanted elements from the datasets...")
print(f"Extracted: {len(extracted_df[extracted_df['PDB_File'].str.endswith('standardized.pdb')])} standardized extracts out of {len(extracted_df)}")
print(f"Repaired: {len(repaired_df[repaired_df['PDB_File'].str.endswith('standardized_Repair.pdb')])} standardized repairs out of {len(repaired_df)}")
extracted_df = extracted_df[~extracted_df['PDB_File'].str.endswith('standardized.pdb')] # Remove standardized unrepaired duplicates
repaired_df = filter_non_standardized(repaired_df) # remove Repairs with standardized duplicates
print(f"Extracted: {len(extracted_df)} dihexapeptide fragment pairs after initial filtering")
print(f"Repaired: {len(repaired_df)} dihexapeptide fragment pairs after initial filtering")
Reading stability datasets...
Loaded

Removing unwanted elements from the datasets...
Extracted: 1508 standardized extracts out of 193431
Repaired: 1508 standardized repairs out of 193431
Extracted: 191923 dihexapeptide fragment pairs after initial filtering
Repaired: 191923 dihexapeptide fragment pairs after initial filtering

We will mostly work with repaired_df (or a subsample of it), which is our extracted dataset after repair.

# Repaired dataset's info
repaired_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 191923 entries, 1 to 386861
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   PDB_File               191923 non-null  object 
 1   Total_Energy           191923 non-null  float64
 2   Backbone_HBond         191923 non-null  float64
 3   Sidechain_HBond        191923 non-null  float64
 4   Van_der_Waals          191923 non-null  float64
 5   Electrostatics         191923 non-null  float64
 6   Solvation_Polar        191923 non-null  float64
 7   Solvation_Hydrophobic  191923 non-null  float64
dtypes: float64(7), object(1)
memory usage: 13.2+ MB

Here is a quick summary of all three datasets (including their total-energy estimates):

# Print how many unique PDB ids are in each dataframe
pdb_ids_t = template_df["PDB_File"].str[:4].unique()
pdb_ids_e = extracted_df["PDB_File"].str[:4].unique()
pdb_ids_r = repaired_df["PDB_File"].str[:4].unique()

K = pd.DataFrame({
    "Dataframe": ["Template", "Extracted", "Repaired"],
    "Unique PDB IDs": [len(pdb_ids_t), len(pdb_ids_e), len(pdb_ids_r)],
    "Total Entries": [len(template_df), len(extracted_df), len(repaired_df)],
    "Mean Total Energy": [template_df["Total_Energy"].mean(), extracted_df["Total_Energy"].mean(), repaired_df["Total_Energy"].mean()],
    "Median Total Energy": [template_df["Total_Energy"].median(), extracted_df["Total_Energy"].median(), repaired_df["Total_Energy"].median()],
    "Max Total Energy": [template_df["Total_Energy"].max(), extracted_df["Total_Energy"].max(), repaired_df["Total_Energy"].max()],
    "Min Total Energy": [template_df["Total_Energy"].min(), extracted_df["Total_Energy"].min(), repaired_df["Total_Energy"].min()],
})

K.head()
Dataframe Unique PDB IDs Total Entries Mean Total Energy Median Total Energy Max Total Energy Min Total Energy
0 Template 80 181 12.377997 14.6891 57.066 -44.0693
1 Extracted 500 191923 49.973542 48.2727 538.420 -30.4614
2 Repaired 500 191923 30.177946 29.5613 270.050 -44.7015

During extraction, information about each structure was written to a protein_properties CSV. Here are a few sanity checks to verify that the pipeline worked as expected.

# Protein properties dataset
print("Loading protein properties dataset...")

protein_properties = pd.read_csv("./results/protein_data_pdb_amyloid_structures.csv")
protein_properties['pdb_file'] = protein_properties['pdb_file'].str[:4]
protein_properties = protein_properties.rename(columns={"pdb_file": "PDB ID"})
print("Loaded\n")

## Simple qualities
print(f"We have {len(protein_properties[protein_properties['num_of_pairs'] == 0])}/{len(protein_properties)} proteins with zero pairs.")
print(f"{len(protein_properties[protein_properties['avg_num_of_layers'] != 5])} proteins have an average number of layers different from 5.\n")

## Correction info
corrected_succ = protein_properties[(protein_properties['num_of_layers_before'] != 5) & (protein_properties['avg_num_of_layers'] == 5)]['PDB ID'].unique()
corrected_unsucc = protein_properties[protein_properties['avg_num_of_layers'] != 5]['PDB ID'].unique()
non_five_layer_proteins = protein_properties[protein_properties['avg_num_of_layers'] != 5]['PDB ID']
num_pairs_non_five = len(repaired_df[(repaired_df['PDB_File'].str[:4]).isin(non_five_layer_proteins)])
print(f"{len(corrected_succ)+len(corrected_unsucc)} proteins needed correction, {len(corrected_succ)} were successful, {len(corrected_unsucc)} were not!")
print(f"{len(repaired_df[(repaired_df['PDB_File'].str[:4]).isin(non_five_layer_proteins)])}/{len(repaired_df)} fragments come from NON-5-layered structs!")
print(f"{len(protein_properties[(protein_properties['avg_num_of_layers'] == 5) & (protein_properties['num_of_pairs'] > 0)])} 5-layered proteins contributed with {len(repaired_df) - num_pairs_non_five}/{len(repaired_df)} pairs.")
Loading protein properties dataset...
Loaded

We have 2/502 proteins with zero pairs.
8 proteins have an average number of layers different from 5.

344 proteins needed correction, 336 were successful, 8 were not!
12592/191923 fragments come from NON-5-layered structs!
492 5-layered proteins contributed with 179331/191923 pairs.

As we can see, a few of our structures failed the layer correction. Some of them still contributed fragment pairs to our datasets, so we remove those.

# Protein properties dataset's info
protein_properties.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 502 entries, 0 to 501
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   PDB ID                 502 non-null    object 
 1   num_of_polypeptides    502 non-null    int64  
 2   num_of_windows         502 non-null    int64  
 3   num_of_stacks          502 non-null    int64  
 4   avg_num_of_layers      502 non-null    float64
 5   num_of_pairs           502 non-null    int64  
 6   num_of_pps_before      502 non-null    int64  
 7   num_of_stacks_before   502 non-null    int64  
 8   num_of_layers_before   502 non-null    float64
 9   num_of_shorter_layers  502 non-null    int64  
 10  antiparallel_count     502 non-null    int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 43.3+ KB
# Remove fragment pairs coming from NON-5-layered structures
print("Removing every fragment from the stability dataset's which comes from NON-5-layered structures...")

repaired_df = repaired_df[~(repaired_df['PDB_File'].str[:4]).isin(non_five_layer_proteins)]
extracted_df = extracted_df[~(extracted_df['PDB_File'].str[:4]).isin(non_five_layer_proteins)]
print(f"{len(extracted_df)} is the length of the Extracted dataset after correction")
print(f"{len(repaired_df)} is the length of the Repaired dataset after correction.")
Removing every fragment that comes from a NON-5-layered structure from the stability datasets...
179331 is the length of the Extracted dataset after correction
179331 is the length of the Repaired dataset after correction.

To keep our repaired_df dataset manageable during computationally heavy analyses, we take a 20% sample.

# Create a 20% sample of the repaired dataset for certain heavy analyses
sample_repaired = repaired_df.sample(frac=0.20, random_state=42)
print(f"Sampled {len(sample_repaired)} rows from repaired_df for later analyses.")

paths = set(sample_repaired["PDB_File"].unique())
with open("pdb_file_list_test.txt", "w") as f:
    for pdb in paths:
        f.write(pdb + "\n")
# Then use:
# rsync -av --progress --partial --append-verify --files-from=pdb_file_list_test.txt meta:/storage/praha1/home/tobiasma/switch-lab/amyloid-interactions/fragment_pairs/ ./fragment_pairs/
Sampled 35866 rows from repaired_df for later analyses.

Our catch-all extraction condition needs further filtering. We filter the structures on distance and keep those that (a sketch of such a filter follows the list):

  • have at least 5 residues in interaction (within one layer) within 4.5 Å
  • use at least 2 residues on each side of the layer
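
The actual distance filtering was done upstream and its results are read from the precomputed lists below (fragment_pairs_filtered_five_two_sample.txt and fragment_pairs_filtered_five_two.txt). The snippet that follows is only a minimal, hypothetical sketch of such a criterion, assuming each hexapeptide of a layer sits on its own chain and that the two facing chains are passed in explicitly; the chain IDs, cutoff and thresholds are illustrative, not the pipeline's actual code.

from Bio.PDB import PDBParser, NeighborSearch

def passes_distance_filter(pdb_path, chain_a, chain_b, cutoff=4.5,
                           min_total=5, min_per_side=2):
    # Count residues of the two facing hexapeptides that have any atom
    # within `cutoff` Angstroms of the opposite chain.
    model = PDBParser(QUIET=True).get_structure("pair", pdb_path)[0]
    search_b = NeighborSearch(list(model[chain_b].get_atoms()))

    in_contact_a, in_contact_b = set(), set()
    for residue in model[chain_a]:
        for atom in residue:
            partners = search_b.search(atom.coord, cutoff)
            if partners:
                in_contact_a.add(residue.id)
                in_contact_b.update(p.get_parent().id for p in partners)

    # Criterion: at least 5 interacting residues in the layer, at least 2 on each side
    return (len(in_contact_a) + len(in_contact_b) >= min_total
            and len(in_contact_a) >= min_per_side
            and len(in_contact_b) >= min_per_side)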
# Filter the sample
stricter_paths = []
with open("fragment_pairs_filtered_five_two_sample.txt", "r") as f:
    for line in f:
        name = line.split('/')[-1]
        name = name if name[-1] != '\n' else name[:-1] 
        stricter_paths.append({"PDB_File": name})

filter_helper = pd.DataFrame(stricter_paths)
filter_helper = filter_helper[filter_helper["PDB_File"].isin(set(sample_repaired["PDB_File"]))]
sample_repaired = pd.merge(sample_repaired, filter_helper, on=["PDB_File"], how='right')
sample_repaired = sample_repaired.dropna()
print(f"Correctly filtered on a condition, real sample has {len(sample_repaired)} structures")
sample_repaired.head()
Correctly filtered on a condition, real sample has 12304 structures
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic
0 7qkj_pair_765_Repair.pdb 18.45160 -30.7581 -8.22142 -42.7342 3.79509 61.2674 -53.1292
1 8olq_pair_012_Repair.pdb 50.32270 -36.4475 -31.34510 -59.5809 11.08980 101.5600 -67.5650
2 8ci8_pair_225_Repair.pdb 12.05720 -36.2584 -6.14683 -67.4686 4.42839 89.9917 -93.9525
3 7nrq_pair_279_Repair.pdb 49.53490 -31.8512 -9.81061 -45.1977 3.33110 86.0854 -51.5352
4 7qkk_pair_121_Repair.pdb 8.61727 -46.0144 -19.36770 -57.6874 6.74647 89.7038 -71.9427
# Filter the whole repaired_df as well
stricter_paths = []
with open("fragment_pairs_filtered_five_two.txt", "r") as f:
    for line in f:
        name = line.split('/')[-1]
        name = name if name[-1] != '\n' else name[:-1] 
        stricter_paths.append({"PDB_File": name})

filter_repaired = pd.DataFrame(stricter_paths)
repaired_df = pd.merge(repaired_df, filter_repaired, on=["PDB_File"], how='right')
repaired_df = repaired_df.dropna()
print(f"Correctly filtered on a condition, repaired df has {len(repaired_df)} structures now")
repaired_df.head()
Correctly filtered on a condition, repaired df has 61951 structures now
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic
0 8v1n_pair_342_Repair.pdb 75.12660 -24.4209 -10.17090 -35.7653 9.865810 81.5012 -34.3863
1 8otg_pair_073_Repair.pdb 7.37734 -48.2354 -16.49210 -53.7371 5.152580 79.6257 -68.4023
2 7xo2_pair_150_Repair.pdb 38.10280 -29.0117 -8.14856 -39.2094 2.617550 63.2756 -48.1347
3 7xo3_pair_287_Repair.pdb 30.79310 -32.1495 -10.58240 -46.0042 -0.740443 73.4309 -57.7334
4 8ot4_pair_130_Repair.pdb 75.06350 -16.4509 -20.34710 -43.9366 10.024300 95.2448 -44.8586

Stability

The first thing to check is whether our dataset's total-energy estimates (from FoldX's Stability command) match Eisenberg's.

# Do a histogram of total energy for both dataframes for comparison
template_label = f"Template (n={len(template_df)})"
extracted_label = f"Extracted (n={len(extracted_df)})"
repaired_label = f"Repaired (n={len(repaired_df)})"

all_stability_data = np.concatenate([
    template_df["Total_Energy"].values,
    #extracted_df["Total_Energy"].values,
    repaired_df["Total_Energy"].values
])

# Define bin edges
bins = np.linspace(all_stability_data.min(), all_stability_data.max(), 106)

plt.figure(figsize=(10, 6))
sns.histplot(template_df["Total_Energy"], bins=bins, color="mediumorchid", label=template_label, kde=False, stat="density")
#sns.histplot(extracted_df["Total_Energy"], bins=50, color="red", label=extracted_label, kde=True, stat="density")
sns.histplot(repaired_df["Total_Energy"], bins=bins, color="cornflowerblue", label=repaired_label, kde=False, stat="density")

plt.xlim(-50, 110)
plt.xlabel("Total Energy")
plt.ylabel("Probability Density")
plt.title("Total Energy Histograms of Template and Repaired fragment pairs (Normalized)")
plt.grid(False)
plt.axvline(0, color="lightgray", linewidth=2, linestyle="-")
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.gca().set_yticks([0, 0.01, 0.02])

plt.legend()
plt.savefig("plots/total_energy_histogram_fragments.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/total_energy_histogram_fragments.pdf", bbox_inches="tight")
plt.savefig("plots/total_energy_histogram_fragments.svg", bbox_inches="tight")
plt.show()
# Best fragments
repaired_df.nsmallest(50, 'Total_Energy').head()
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic
4514 8pkg_pair_302_Repair.pdb -44.7015 -51.9016 -1.37386 -69.9902 4.036450 70.8653 -107.5340
7157 8q98_pair_008_Repair.pdb -39.7086 -51.9347 -18.13530 -70.7382 1.057940 76.6335 -103.1080
29008 8q98_pair_309_Repair.pdb -39.7062 -51.6920 -15.25990 -70.4398 1.018250 76.2852 -102.5010
44807 7r5h_pair_010_Repair.pdb -39.6906 -50.3670 -12.58230 -70.4432 0.979359 74.7961 -102.7270
13972 8q98_pair_305_Repair.pdb -38.9145 -52.5437 -20.16940 -66.3700 0.840698 73.8202 -94.4693
# Worst fragments
repaired_df.nlargest(50, 'Total_Energy').head()
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic
16336 8bg9_pair_084_Repair.pdb 215.356 -34.8066 -10.9174 -51.8010 44.2806 110.281 -59.6230
28803 8bg9_pair_091_Repair.pdb 215.356 -34.8066 -10.9174 -51.8010 44.2806 110.281 -59.6230
11842 8bg9_pair_104_Repair.pdb 210.551 -35.2233 -11.3284 -52.7205 44.2306 109.575 -61.5739
42611 8bg9_pair_097_Repair.pdb 210.551 -35.2233 -11.3284 -52.7205 44.2306 109.575 -61.5739
8658 8bg9_pair_098_Repair.pdb 201.444 -37.8533 -17.3518 -50.9960 34.9611 110.994 -55.5531

Ramachandran

To quickly check whether our dataset contains backbone conformations that are absent from Eisenberg's dataset, we draw Ramachandran plots.

# Create a Ramachandran plot of every hexapeptide within the repaired dataset's sample
def extract_phi_psi_angles(folder, sample=None):
    parser = PDBParser(QUIET=True)
    ppb = PPBuilder()
    phi_psi_all = []

    for pdb_file in os.listdir(folder):
        if not pdb_file.endswith(".pdb") or (sample and pdb_file not in sample):
            continue

        structure = parser.get_structure(pdb_file, os.path.join(folder, pdb_file))
        for pp in ppb.build_peptides(structure):
            phi_psi = pp.get_phi_psi_list()
            for phi, psi in phi_psi:
                if phi is not None and psi is not None:  # skip None values (termini)
                    phi_psi_all.append((math.degrees(phi), math.degrees(psi)))
    return phi_psi_all

# === Extract phi/psi angles if not computed already ===
if os.path.exists("./results/phi_psi_angles_sample.npz") and os.path.exists("./results/phi_psi_angles_sample.npz"):
    angles1 = np.load("./results/phi_psi_angles_sample.npz")["angles"]
    angles2 = np.load("./results/phi_psi_angles_templates.npz")["angles"]
else:
    angles1 = extract_phi_psi_angles("fragment_pairs/", set(sample_repaired["PDB_File"].unique())) # only from the sample
    angles2 = extract_phi_psi_angles("templates/")
    np.savez_compressed("./results/phi_psi_angles_sample.npz", angles=np.array(angles1))
    np.savez_compressed("./results/phi_psi_angles_templates.npz", angles=np.array(angles2))
fig, axs = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=True)

phi1, psi1 = zip(*angles1)
axs[0].scatter(phi1, psi1, s=1, alpha=0.008, color="cornflowerblue")
axs[0].set_title(f"Ramachandran Plot - Extracted (n={len(phi1)})")
axs[0].set_xlabel("ϕ")
axs[0].set_ylabel("ψ")
axs[0].grid(False)
for spine in axs[0].spines.values():
    spine.set_color('gray')
axs[0].axvline(0, color="lightgray", linewidth=2, linestyle="-")
axs[0].axhline(0, color="lightgray", linewidth=2, linestyle="-")

phi2, psi2 = zip(*angles2)
axs[1].scatter(phi2, psi2, s=1, alpha=0.1, color="mediumorchid")
axs[1].set_title(f"Ramachandran Plot - Templates (n={len(phi2)})")
axs[1].set_xlabel("ϕ")
axs[1].grid(False)
for spine in axs[1].spines.values():
    spine.set_color('gray')
axs[1].axvline(0, color="lightgray", linewidth=2, linestyle="-")
axs[1].axhline(0, color="lightgray", linewidth=2, linestyle="-")

# Set shared limits
for ax in axs:
    ax.set_xlim(-180, 180)
    ax.set_ylim(-180, 180)

plt.savefig("plots/ramachandran.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/ramachandran.pdf", bbox_inches="tight")
plt.savefig("plots/ramachandran.svg", bbox_inches="tight")
plt.tight_layout()
plt.show()

Sequence detail

To assess each amino acid's contribution to the stability of its structure, we use FoldX's SequenceDetail command.

# Load sequence detail
print ("Loading sequence detail datasets...")
seqdetail_templates = pd.read_csv("./results/foldx_seqdetail_results_templates.csv")
seqdetail = pd.read_csv("./results/foldx_seqdetail_results_fragment_pairs.csv")
## Get rid of non-standardized files
seqdetail = filter_non_standardized(seqdetail)
print("Loaded")

print("Head of Sequence detail dataset of Repaired fragments")
seqdetail.head()
Loading sequence detail datasets...
Loaded
Head of Sequence detail dataset of Repaired fragments
PDB_File Three_Letter Chain Residue_Number Total_Energy BackHBond SideHBond
0 ./SD_2n0a_pair_2691_Repair.pdb ALA C 85 -0.032793 0.000000 0.0
1 ./SD_2n0a_pair_2691_Repair.pdb GLY C 86 0.657411 0.000000 0.0
2 ./SD_8oq4_pair_180_Repair.pdb GLN F 37 0.643152 0.000000 0.0
3 ./SD_2n0a_pair_5190_Repair.pdb H2S F 50 0.514245 -0.485031 0.0
4 ./SD_8uq7_pair_304_Repair.pdb GLY A 335 0.397621 -0.596954 0.0
# Check how many residues in seqdetail come from pairs of NON-5-layer proteins.
percentage=round(100*len(seqdetail[(seqdetail['PDB_File'].str[5:9]).isin(non_five_layer_proteins)])/len(seqdetail), 4)
print(f"{len(seqdetail[(seqdetail['PDB_File'].str[5:9]).isin(non_five_layer_proteins)])}/{len(seqdetail)} ({percentage}%) elements come from NON-5-layered structures")

# Remove unwanted elements from the dataframe
seqdetail = seqdetail[~(seqdetail['PDB_File'].str[5:9]).isin(non_five_layer_proteins)]
print(f"{len(seqdetail)} is the length of the seqdetail dataset after correction.")
151104/10910230 (1.385%) elements come from NON-5-layered structures
10759126 is the length of the seqdetail dataset after correction.
# Distance filtering
seqdetail = seqdetail[(seqdetail['PDB_File'].str[5:]).isin(set(repaired_df["PDB_File"]))]
print(f"After filtering based on length, we have {len(seqdetail)} residues")
After filtering on the distance condition, we have 3716740 residues

Let's check the dataset for sequence redundancies (a rough sketch follows the TODO below).

# TODO: Use already known functions (to get the ordered list of atoms - and then residues)
print("TODO")
TODO
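
The proper check (reusing the structure-parsing helpers to get the ordered residues of each fragment) is still a TODO. Below is only a rough, hypothetical sketch that rebuilds each fragment pair's sequence from the seqdetail table itself; it assumes residues can be ordered by Chain and Residue_Number, maps FoldX's histidine variants (H1S, H2S) back to HIS, and marks any other nonstandard residue (e.g. PTR) as 'X'.

from Bio.Data.IUPACData import protein_letters_3to1

FOLDX_HIS_VARIANTS = {"H1S": "HIS", "H2S": "HIS"}  # FoldX protonation-state names

def fragment_sequences(seqdetail_df):
    df = seqdetail_df.copy()
    df["Std"] = df["Three_Letter"].replace(FOLDX_HIS_VARIANTS)
    # Map three-letter codes to one-letter; anything unknown (e.g. PTR) becomes 'X'
    df["One"] = df["Std"].str.title().map(protein_letters_3to1).fillna("X")
    ordered = df.sort_values(["PDB_File", "Chain", "Residue_Number"])
    # One concatenated sequence string per fragment pair (chains in order)
    return ordered.groupby("PDB_File")["One"].agg("".join)

# Example usage (not run here):
# seqs = fragment_sequences(seqdetail)
# print(f"{seqs.duplicated().mean():.1%} of fragment pairs share a sequence with another pair")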

After this initial filtering of the seqdetail dataset, we compare the residue distributions of the two sequence-detail datasets: the repaired hexapeptide pairs (our repaired extracted fragments) and the template structures (Eisenberg's).

seqdetail['Three_Letter'].value_counts()
Three_Letter
GLY    600140
VAL    509335
ALA    356712
LYS    318728
THR    261343
SER    250223
ILE    181325
GLN    179907
LEU    165575
ASN    157148
GLU    141643
PRO    102752
ASP     79595
TYR     75767
H1S     71267
PHE     65764
H2S     55404
MET     53825
CYS     37430
ARG     25908
HIS     18514
TRP      7930
PTR       505
Name: count, dtype: int64
# FIXME Get rid of hetatoms? e.g. ZN
print("Residues and het-atoms in the template dataset")
seqdetail_templates['Three_Letter'].value_counts()
Residues and het-atoms in the template dataset
Three_Letter
GLY    1930
VAL    1760
ALA     950
THR     920
ASN     850
SER     840
LEU     840
ILE     800
TYR     370
PHE     330
GLN     310
MET     300
LYS     240
GLU     130
H1S     111
ZN       70
ASP      40
HIS      32
PRO      30
H2S      27
TRP      20
Name: count, dtype: int64

Total Energies

Let's look at the total energy of each individual residue, again comparing our dataset and Eisenberg's.

desc = seqdetail['Total_Energy'].describe()
print(desc.apply(lambda x: f"{x:.6f}"))
count    3716740.000000
mean           0.489640
std            1.178426
min           -4.176860
25%           -0.152940
50%            0.527413
75%            1.212220
max           14.548800
Name: Total_Energy, dtype: object
desc = seqdetail_templates['Total_Energy'].describe()
print(desc.apply(lambda x: f"{x:.6f}"))
count    10906.000000
mean         0.205430
std          1.178158
min         -4.129810
25%         -0.446808
50%          0.259494
75%          0.908089
max          7.062650
Name: Total_Energy, dtype: object

Our best (lowest total energy) and worst (highest) residues:

# Best residues according to seqdetail energy estimation
best_repaired = repaired_df.nsmallest(100, "Total_Energy").head() # TODO

seqdetail.nsmallest(100, "Total_Energy").head()
PDB_File Three_Letter Chain Residue_Number Total_Energy BackHBond SideHBond
3577542 ./SD_8r47_pair_205_Repair.pdb ARG C 28 -4.17686 -0.247444 -1.69147
7354890 ./SD_8r47_pair_199_Repair.pdb ARG C 28 -4.17686 -0.247444 -1.69147
4467223 ./SD_7u11_pair_006_Repair.pdb TYR A 125 -4.08744 -1.371750 -1.56968
650860 ./SD_9fnb_pair_004_Repair.pdb TYR B 125 -3.96797 -1.542940 -1.04820
7055515 ./SD_9fnb_pair_005_Repair.pdb TYR B 125 -3.95643 -1.549240 -1.04820
# Worst residues according to seqdetail energy estimation
worst_repaired = repaired_df.nlargest(100, "Total_Energy").head() # TODO

seqdetail.nlargest(100, "Total_Energy").head()
PDB_File Three_Letter Chain Residue_Number Total_Energy BackHBond SideHBond
4182516 ./SD_8bg9_pair_085_Repair.pdb GLU I 22 14.5488 -0.687146 0.0
7962205 ./SD_8bg9_pair_092_Repair.pdb GLU I 22 14.5488 -0.687146 0.0
805688 ./SD_8bg9_pair_091_Repair.pdb GLU A 22 14.3261 -0.622249 0.0
1120332 ./SD_8bg9_pair_084_Repair.pdb GLU A 22 14.3261 -0.622249 0.0
4180836 ./SD_8bg9_pair_085_Repair.pdb GLU A 22 14.2986 -0.626909 0.0

To see which residues make our worst extracted hexapeptide pairs bad, we take the fragments whose total energy (estimated with the Stability command) is among the 800 largest. We compare those bad fragments with all of our fragments.

# Label bad fragment's sequence detail elements (highest energy fragments according to Stability)
print(len(repaired_df[repaired_df['Total_Energy'] >= 80]))
print(f"{100*len(repaired_df[repaired_df['Total_Energy'] >= 80])/len(repaired_df)} %")

highest_energy_repaired = repaired_df.nlargest(800, "Total_Energy")
seqdetail_bad = seqdetail[(seqdetail["PDB_File"].str[5:]).isin(highest_energy_repaired["PDB_File"])]
# Add a category column to differentiate both datasets
seqdetail.loc[:, "Category"] = "All Fragments"
seqdetail_bad = seqdetail_bad.copy()
seqdetail_bad.loc[:, "Category"] = "Bad Fragments"
799
1.2897289793546514 %
paths = set(seqdetail_bad["PDB_File"].str[5:].unique())
with open("pdb_file_list_bad_test.txt", "w") as f:
    for pdb in paths:
        f.write(pdb + "\n")
# Then use:
# rsync -av --progress --partial --append-verify --files-from=pdb_file_list_bad_test.txt meta:/storage/praha1/home/tobiasma/switch-lab/amyloid-interactions/fragment_pairs/ ./fragment_pairs/
# Define amino acid groups (uppercase)
amino_acid_groups = {
    "Hydrophobic": ["ALA", "VAL", "LEU", "ILE", "MET", "PHE", "TRP", "PRO"],
    "Polar": ["SER", "THR", "ASN", "GLN", "TYR", "CYS", "GLY"],  # Added Glycine to Polar
    "Charged_Positive": ["LYS", "ARG", "HIS"],
    "Charged_Negative": ["ASP", "GLU"]
}

# Define the custom order for x-axis (grouped)
custom_order = (
    amino_acid_groups["Hydrophobic"] +
    amino_acid_groups["Polar"] +
    amino_acid_groups["Charged_Positive"] +
    amino_acid_groups["Charged_Negative"]
)

# Define color palette for groups
group_colors = {
    "Hydrophobic": "yellow", # #bdc6d5
    "Polar": "cyan", # #e7c65b
    "Charged_Positive": "red", # #4c99cf
    "Charged_Negative": "blue", # #df6b6a
    "Unknown": "gray"
}
# Merge both datasets
combined_data = pd.concat([seqdetail, seqdetail_bad])

# Assign the "Unknown" group for amino acids not in the predefined groups
all_amino_acids = set(combined_data["Three_Letter"].unique())  # Get unique amino acids from data
known_amino_acids = set(sum(amino_acid_groups.values(), []))  # Flatten list of known amino acids
amino_acid_groups["Unknown"] = list(all_amino_acids - known_amino_acids) # Find any unknown amino acids
custom_order += (amino_acid_groups["Unknown"])

aa_color_map = {aa: group_colors[group] for group, amino_acids in amino_acid_groups.items() for aa in amino_acids}
plt.figure(figsize=(18, 6))

# Create a violin plot with split violins
sns.violinplot(
    x="Three_Letter", y="Total_Energy", hue="Category", 
    data=combined_data, split=True, inner="quartile",
    palette={"All Fragments": "cornflowerblue", "Bad Fragments": "lightcoral"},
    order=custom_order
)

plt.xlabel("Amino Acid")
plt.ylabel("Total Energy")
plt.title("Total Energy Contribution of Each Amino Acid of All Fragments and 100 Worst Fragments")
plt.legend(title="Fragment Type")
plt.xticks(rotation=45)
plt.ylim(-6, 13.5)

ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
hue_labels = {
    "All Fragments": f"All Fragments (n={len(seqdetail)} Res)",
    "Bad Fragments": f"Bad Fragments (n={len(seqdetail_bad)} Res)"
}
new_labels = [hue_labels.get(label, label) for label in labels]
ax.legend(handles, new_labels, title="Fragment Type")

for label in ax.get_xticklabels():
    aa = label.get_text()
    color = aa_color_map.get(aa)
    label.set_bbox(dict(facecolor=color, edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5)) # Apply background color

plt.show()

Let's plot the estimated energy of each residue to compare the datasets.

seqdetail_templates.loc[:, "Category"] = "Template Fragments"

plt.figure(figsize=(10, 6))

# Merge both dataset (the whole seqdetail and template's seqdetail)
combined_data = pd.concat([seqdetail_templates, seqdetail])
combined_data = combined_data.reset_index(drop=True)

# kde plot of seqdetail templates vs seqdetail repaired total energy (of all residues, not-type specific)
sns.kdeplot(
    data=combined_data, x="Total_Energy",
    hue="Category",
    fill=True, common_norm=False,
    palette={"Template Fragments": "mediumorchid", "All Fragments": "cornflowerblue"},
    alpha=0.5
)

plt.axvline(0, color="lightgray", linewidth=2, linestyle="--")
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.gca().set_yticks([0, 0.1, 0.2, 0.3, 0.4])

plt.xlabel("Total Energy")
plt.ylabel("Density")
plt.grid(False)
plt.xlim(-6, 6)
plt.title("Per-residue Total Energy Distribution of Extracted Fragments and Template Fragments")
plt.savefig("plots/total_energy_kde_residues.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/total_energy_kde_residues.pdf", bbox_inches="tight")
plt.savefig("plots/total_energy_kde_residues.svg", bbox_inches="tight")
plt.show()

Relative Solvent Accessibility

We compute and compare the Relative Solvent Accessibility (RSA) of the residues in our bad sample and of the residues from the structures in our repaired_df sample. RSA is a residue's solvent-accessible surface area (SASA) divided by the maximum SASA of that residue type, expressed as a percentage (e.g. a glycine with a SASA of 52 Å² has an RSA of 52/104 ≈ 50%).

# Biopython's solvent accessibility per residue (one number per residue)

AA_TO_MAX_SASA_MAP = {
    "ALA": 129,
    "ARG": 274,
    "ASN": 195,
    "ASP": 193,
    "CYS": 167,
    "GLN": 223,
    "GLU": 225,
    "GLY": 104,
    "HIS": 224,
    "ILE": 197,
    "LEU": 201,
    "LYS": 236,
    "MET": 224,
    "PHE": 240,
    "PRO": 159,
    "SER": 155,
    "THR": 155,
    "TRP": 285,
    "TYR": 263,
    "VAL": 174,
}

def clip_value(lower, value, upper):
    return lower if value < lower else upper if value > upper else value

def remove_heteroatoms(structure):
    # Cannot delete heteroatoms (which are considered as residues in biopython) while iterating over structure.get_residues(), so we first need to get a list of their ids
    chain_to_heteroatoms_map = defaultdict(list)

    model = structure[0]

    for residue in model.get_residues():
        if not is_aa(residue, standard=True):
            chain_to_heteroatoms_map[residue.parent.id].append(residue.id)

    for chain, heteroatoms in chain_to_heteroatoms_map.items():
        try:
            chain_object = model[chain]
        except KeyError:
            print(f'Chain {chain} not found in structure.')
            print(f'Chains in structure: {[chain.id for chain in model.get_chains()]}')
            print(f'Chain to heteroatoms map: {chain_to_heteroatoms_map}')
            raise
        for heteroatom in heteroatoms:
            chain_object.detach_child(heteroatom)
    return

def calculate_residue_RSA(residue):
    residue_sasa, residue_max_sasa = residue.sasa, AA_TO_MAX_SASA_MAP[residue.resname]
    RSA = (residue_sasa / residue_max_sasa) * 100
    return round(RSA, ndigits = None) # Returns an integer type, which is what we want because the RSA probabilities table is set up using bins of size 1.

def get_residue_RSA_values(PDB_file_path, full_residue_IDs_list, chain_subset=None):
    """
    The RSA (Relative Solvent Accessibility) values of the list of full residue IDs are calculated in the context of the provided chain subset. For example, if the
    full PDB is an antibody antigen complex with chains AHL, chain_subset = 'HL' will calculate RSA values of residues in the HL complex context and ignore
    chain A. This is important to calculate the RSA values of interface residues correctly.
    """
    parser = PDBParser(QUIET = True)
    structure = parser.get_structure('', PDB_file_path)

    remove_heteroatoms(structure) # If not removed, heteroatoms would be accounted for when calculating the SASA of each residue.

    if chain_subset is not None:
        chains_to_remove = set(chain.id for chain in structure.get_chains()) - set(chain_subset)
        for chain_ID in chains_to_remove:
            structure.detach_child(chain_ID)

    sasa_calculator = SASA.ShrakeRupley()
    sasa_calculator.compute(structure, level='R')

    full_residue_IDs_set = set(full_residue_IDs_list)
    full_residue_ID_to_RSA_map = {}
    for residue in structure.get_residues():
        AA = protein_letters_3to1[residue.resname.title()]
        structure_ID, model_ID, chain_ID, (heteroatom_flag, position, icode) = residue.full_id # Biopython residue ID

        full_residue_ID = f'{AA}{chain_ID}{position}'
        if full_residue_ID not in full_residue_IDs_set:
            continue
        residue_RSA = calculate_residue_RSA(residue)
        full_residue_ID_to_RSA_map[full_residue_ID] = clip_value(lower=0, value=residue_RSA, upper=99) # The RSA bins in the probability table range between 0 and 99.

    residue_RSA_values = [full_residue_ID_to_RSA_map[full_residue_ID] for full_residue_ID in full_residue_IDs_list]
    # check that it doesn't contain None values:
    if None in residue_RSA_values:
        print(f'Full residue IDs list: {residue_RSA_values}')
        print(f'Full residue ID to RSA map: {full_residue_ID_to_RSA_map}')
        print(PDB_file_path, structure.get_residues())

    if len(residue_RSA_values) != len(full_residue_IDs_list):
        print(f'Full residue IDs list: {full_residue_IDs_list}')
        print(f'Full residue ID to RSA map: {full_residue_ID_to_RSA_map}')
        print(PDB_file_path, structure.get_residues())
        raise ValueError('Some residues do not have RSA values calculated.')

    return residue_RSA_values

def process_pdb_RSA(pdb_file, directory):
    parser = PDBParser(QUIET=True)
    try:
        structure = parser.get_structure(pdb_file, f"{directory}/{pdb_file}")
    except PDBExceptions.PDBConstructionException as e:
        print(f"Failed to parse {pdb_file}: {e}")
        return pdb_file, []

    residue_ID_list = []
    residue_key = []
    model = structure[0]  # Assuming we want the first model
    for chain in model:
        for residue in chain:
            structure_ID, model_ID, chain_ID, (heteroatom_flag, position, icode) = residue.get_full_id()
            try:
                residue_ID_list.append(f"{protein_letters_3to1[residue.resname.title()]}{chain_ID}{position}")
            except KeyError:
                continue
            residue_key.append((chain_ID, position))

    RSA_values = get_residue_RSA_values(f"{directory}/{pdb_file}", residue_ID_list)
    result_list = []
    for i, residue_ID in enumerate(residue_ID_list):
        chain_ID, position = residue_key[i]
        result_list.append((chain_ID, position, RSA_values[i]))

    return pdb_file, result_list

def get_residue_RSA_values_from_df(seqdetail_df, directory="./fragment_pairs", max_workers=8):
    """
    Parallel RSA (Relative Solvent Accessibility) calculation for residues in the provided dataframe.
    """
    seqdetail_df = seqdetail_df.copy()
    seqdetail_df['PDB_File'] = seqdetail_df['PDB_File'].str[5:]
    pdb_files = seqdetail_df['PDB_File'].unique()

    updates = []  # collect updates here

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_pdb_RSA, pdb_file, directory): pdb_file for pdb_file in pdb_files}
        for future in as_completed(futures):
            pdb_file, results = future.result()
            for chain_ID, position, rsa_value in results:
                updates.append({
                    'PDB_File': pdb_file,
                    'Chain': chain_ID,
                    'Residue_Number': position,
                    'RSA': rsa_value
                })

    # Create DataFrame from updates
    updates_df = pd.DataFrame(updates)

    # Merge in the new RSA values
    seqdetail_df = seqdetail_df.merge(
        updates_df,
        on=['PDB_File', 'Chain', 'Residue_Number'],
        how='left'
    )

    return seqdetail_df
# Create a seqdetail's sample
K = seqdetail[(seqdetail["PDB_File"].str[5:]).isin(sample_repaired["PDB_File"])].copy()
## Label seqdetail categories
K.loc[:, "Category"] = "Sampled Fragments"

# Get the residue RSA for each dataset
if os.path.exists("./results/templates_with_rsa.csv") and os.path.exists("./results/sample_with_rsa.csv") and os.path.exists("./results/bad_with_rsa.csv"):
    seqdetail_templates_with_RSA = pd.read_csv("./results/templates_with_rsa.csv")
    seqdetail_sample_with_RSA = pd.read_csv("./results/sample_with_rsa.csv")
    seqdetail_bad_with_RSA = pd.read_csv("./results/bad_with_rsa.csv")
else:
    seqdetail_templates_with_RSA = get_residue_RSA_values_from_df(seqdetail_templates, directory="./templates")
    seqdetail_sample_with_RSA = get_residue_RSA_values_from_df(K)
    seqdetail_bad_with_RSA = get_residue_RSA_values_from_df(seqdetail_bad)
    seqdetail_templates_with_RSA.to_csv("./results/templates_with_rsa.csv", index=False)
    seqdetail_sample_with_RSA.to_csv("./results/sample_with_rsa.csv", index=False)
    seqdetail_bad_with_RSA.to_csv("./results/bad_with_rsa.csv", index=False)
len(seqdetail_sample_with_RSA)
738195
plt.figure(figsize=(18, 6))

# Merge both seqdetail datasets (bad and sampled fragments)
combined_data = pd.concat([seqdetail_sample_with_RSA, seqdetail_bad_with_RSA], ignore_index=True)

# Create a violin plot with split violins
sns.violinplot(
    x="Three_Letter", y="RSA", hue="Category", 
    data=combined_data, split=True, inner="quartile",
    palette={"Sampled Fragments": "cornflowerblue", "Bad Fragments": "lightcoral"},
    order=custom_order,
    cut=0 # Do not estimate KDE out of possible range (make it strictly 0-100)
)

plt.xlabel("Amino Acid")
plt.ylabel("RSA")
plt.title("RSA of Residues in Fragments With the Highest Total Energy Compared to Sampled Fragments")
plt.legend(title="Fragment Type")
plt.xticks(rotation=45)
plt.axhline(0, color="lightgray", linewidth=2)
plt.axhline(100, color="lightgray", linewidth=2)

ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
hue_labels = {
    "Sampled Fragments": f"Sampled Fragments (n={len(seqdetail_sample_with_RSA)} Res)",
    "Bad Fragments": f"Bad Fragments (n={len(seqdetail_bad_with_RSA)} Res)"
}
new_labels = [hue_labels.get(label, label) for label in labels]
ax.legend(handles, new_labels, title="Fragment Type")

for label in ax.get_xticklabels():
    aa = label.get_text()
    color = aa_color_map.get(aa)
    label.set_bbox(dict(facecolor=color, edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5)) # Apply background color

plt.show()

We compare the sample of our dataset to Eisenberg's as well.

plt.figure(figsize=(18, 6))

# Merge both datasets (sampled and seqdetail from templates)
combined_data = pd.concat([seqdetail_sample_with_RSA, seqdetail_templates_with_RSA])
combined_data = combined_data.reset_index(drop=True)

# Create a violin plot with split violins
sns.violinplot(
    x="Three_Letter", y="RSA", hue="Category", 
    data=combined_data, split=True, inner="quartile",
    palette={"Sampled Fragments": "cornflowerblue", "Template Fragments": "mediumorchid"},
    alpha=0.8, saturation=1,
    order=custom_order, cut=0 # Do not estimate KDE out of possible range (make it strictly 0-100)
)

plt.xlabel("Amino Acid")
plt.ylabel("RSA")
plt.title("RSA Comparison of Residues From Extracted Fragments and Template Fragments")
plt.legend(title="Fragment Type")
plt.xticks(rotation=45)

plt.gca().set_yticks([0, 20, 40, 60, 80, 100])
plt.ylim(0, 100)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.grid(False)

ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
hue_labels = {
    "Sampled Fragments": f"Sampled Fragments (n={len(seqdetail_sample_with_RSA)} Res)",
    "Template Fragments": f"Template Fragments (n={len(seqdetail_templates_with_RSA)} Res)"
}
new_labels = [hue_labels.get(label, label) for label in labels]
ax.legend(handles, new_labels, title="Fragment Type")

for label in ax.get_xticklabels():
    aa = label.get_text()
    color = aa_color_map.get(aa)
    label.set_bbox(dict(facecolor=color, edgecolor='none', boxstyle='round,pad=0.3', alpha=0.5)) # Apply background color


plt.savefig("plots/rsa_violin_all_residues.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/rsa_violin_all_residues.pdf", bbox_inches="tight")
plt.savefig("plots/rsa_violin_all_residues.svg", bbox_inches="tight")
plt.show()

We divide the residues into 3 categories: Stabilizing, Neutral and Destabilizing. Stabilizing residues have an estimated total energy below -0.5, Destabilizing above 0.5, and Neutral everything in between. Let's see whether the distributions match across datasets.

# Create a new column with default value (optional, but good practice)
seqdetail_templates_with_RSA["Stability"] = "Unknown"
seqdetail_sample_with_RSA["Stability"] = "Unknown"

# Assign values using .loc
seqdetail_templates_with_RSA.loc[seqdetail_templates_with_RSA["Total_Energy"] < -0.5, "Stability"] = "Stabilizing"
seqdetail_templates_with_RSA.loc[seqdetail_templates_with_RSA["Total_Energy"] >= -0.5, "Stability"] = "Neutral"
seqdetail_templates_with_RSA.loc[seqdetail_templates_with_RSA["Total_Energy"] > 0.5, "Stability"] = "Destabilizing"

seqdetail_sample_with_RSA.loc[seqdetail_sample_with_RSA["Total_Energy"] < -0.5, "Stability"] = "Stabilizing"
seqdetail_sample_with_RSA.loc[seqdetail_sample_with_RSA["Total_Energy"] >= -0.5, "Stability"] = "Neutral"
seqdetail_sample_with_RSA.loc[seqdetail_sample_with_RSA["Total_Energy"] > 0.5, "Stability"] = "Destabilizing"

# Combine the DataFrames
combined_data = pd.concat([seqdetail_templates_with_RSA, seqdetail_sample_with_RSA], ignore_index=True)
plt.figure(figsize=(10, 6))

# sns.violinplot of destabilizing and stabilizing RSA
sns.violinplot(
    x="Stability", y="RSA", hue="Category",
    data=combined_data, split=True, inner="quartile",
    palette={"Sampled Fragments": "cornflowerblue", "Template Fragments": "mediumorchid"},
    order=["Stabilizing", "Neutral", "Destabilizing"],
    alpha=0.8, saturation=1,
    cut=0
)
plt.xlabel("Stability")
plt.ylabel("RSA")
plt.title("RSA Comparison of Different Types of Residues between Sampled and Template Fragments")
plt.legend(title="Fragment Type")

plt.gca().set_yticks([0, 20, 40, 60, 80, 100])
plt.ylim(0, 100)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.grid(False)

ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
new_labels = [hue_labels.get(label, label) for label in labels]
ax.legend(handles, new_labels, title="Fragment Type")

plt.savefig("plots/rsa_stabilizing_destabilizing_residues.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/rsa_stabilizing_destabilizing_residues.pdf", bbox_inches="tight")
plt.savefig("plots/rsa_stabilizing_destabilizing_residues.svg", bbox_inches="tight")
plt.show()

Hydrogen Bonds

We also compare the hydrogen-bond energies, both backbone and side-chain.

plt.figure(figsize=(10, 6))

# Merge both dataset (the whole seqdetail and template's seqdetail)
combined_data = pd.concat([seqdetail_templates, seqdetail])
combined_data = combined_data.reset_index(drop=True)

# kde plot of seqdetail templates vs seqdetail repaired backhbond (of all residues, not-type specific)
sns.kdeplot(
    data=combined_data, x="BackHBond",
    hue="Category",
    fill=True, common_norm=False,
    palette={"Template Fragments": "mediumorchid", "All Fragments": "cornflowerblue"},
    alpha=0.5
)
plt.xlabel("Backbone Hydrogen Bond")
plt.ylabel("Density")
plt.title("Per-residue BackHBond Distribution of Extracted Fragments and Template Fragments")

plt.xlim(-2.5, 0.5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.grid(False)

plt.savefig("plots/backbone_h_bond_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/backbone_h_bond_kde.pdf", bbox_inches="tight")
plt.savefig("plots/backbone_h_bond_kde.svg", bbox_inches="tight")
plt.show()
plt.figure(figsize=(10, 6))

# kde plot of seqdetail templates vs seqdetail repaired sidehbond (of all residues, not-type specific)
sns.kdeplot(
    data=combined_data, x="SideHBond",
    hue="Category",
    fill=True, common_norm=False,
    palette={"Template Fragments": "mediumorchid", "All Fragments": "cornflowerblue"},
    alpha=0.5
)
plt.xlabel("Sidechain Hydrogen Bond")
plt.ylabel("Density")
plt.title("Per-residue SideHBond Distribution of Extracted Fragments and Template Fragments")

plt.gca().set_yticks([0, 4, 8, 12, 16])
plt.xlim(-2.5, 0.5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.grid(False)

plt.savefig("plots/sidechain_h_bond_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/sidechain_h_bond_kde.pdf", bbox_inches="tight")
plt.savefig("plots/sidechain_h_bond_kde.svg", bbox_inches="tight")
plt.show()

Buried Surface Area

We compute the Buried Surface Area (BSA) of Eisenberg's structures and of the structures in our sample. BSA is the surface area buried upon complex formation: the SASA of the two stacks computed separately (unbound) minus the SASA of the bound complex.

# Burried Surface Area (one number per fragment)

import io
from collections import defaultdict
from Bio.PDB import PDBParser, PDBIO, Select

from utils.peptides import get_chains_from_stacks
from utils.peptides import ordered_stacks_split_multi_pp

class ChainSelect(Select):
    def __init__(self, chain_ids):
        self.chain_ids = set(chain_ids)
    def accept_chain(self, chain):
        return chain.id in self.chain_ids

def calculate_structure_SASA(structure):
    # Print all the chains in the structure
    # print(f"Calculating SASA for structure with chains: {[chain.id for chain in structure.get_chains()]}")
    sasa_calculator = SASA.ShrakeRupley()
    sasa_calculator.compute(structure, level='R')
    #pdb_io = PDBIO()
    #pdb_io.set_structure(structure)
    #pdb_io.save(f"temp_{structure.id}.pdb", select=ChainSelect(set([chain.id for chain in structure.get_chains()])))  # Save the structure to a temporary file
    #print(len(list(structure.get_residues())))
    return sum(residue.sasa for residue in structure.get_residues())

def get_unbound_structure(original_structure, subset_chains):
    pdb_io = PDBIO() 
    pdb_parser = PDBParser(QUIET=True)
    buf = io.StringIO()
    pdb_io.set_structure(original_structure)
    pdb_io.save(buf, select=ChainSelect(subset_chains))
    #pdb_io.save(f"unbound_{'_'.join(subset_chains)}{original_structure}.pdb", select=ChainSelect(subset_chains))
    buf.seek(0) # FIXME - using this in parallel is dangerous
    return pdb_parser.get_structure('', buf)

def calculate_BSA(structure, chain_subset_1, chain_subset_2):
    remove_heteroatoms(structure)

    # Filter to only chains of interest (both subsets)
    model = structure[0]
    combined_subset = set(chain_subset_1) | set(chain_subset_2)
    chains_to_remove = set(chain.id for chain in structure.get_chains()) - combined_subset
    #print(chain_subset_1, chain_subset_2, "END")

    for chain_ID in chains_to_remove:
        if chain_ID != ' ': # FIXME what is this?? It is present in the models chains
            try:
                model.detach_child(chain_ID)
            except Exception as e:
                raise KeyError(f"idk, {e}, {set(chain.id for chain in structure.get_chains())}, {combined_subset}")

    #print("Calculate SASA")
    # Calculate bound SASA
    #full_struct = get_unbound_structure(structure, combined_subset)
    bound_SASA = calculate_structure_SASA(structure)

    # Calculate unbound SASAs
    unbound_1 = get_unbound_structure(structure, chain_subset_1)
    unbound_SASA_1 = calculate_structure_SASA(unbound_1)

    unbound_2 = get_unbound_structure(structure, chain_subset_2)
    unbound_SASA_2 = calculate_structure_SASA(unbound_2)

    # Total unbound SASA (sum of both)
    # print(f"Unbound SASA 1: {unbound_SASA_1} + Unbound SASA 2: {unbound_SASA_2}")
    unbound_SASA_total = unbound_SASA_1 + unbound_SASA_2

    # Buried surface area
    # print(f"Unbound SASA Total: {unbound_SASA_total} - Bound SASA: {bound_SASA}")
    buried_surface_area = unbound_SASA_total - bound_SASA

    return unbound_SASA_total, bound_SASA, buried_surface_area

def process_pdb_BSA(pdb_file, directory):
    pdb_parser = PDBParser(QUIET=True)
    try:
        print(f"Processing {pdb_file} for BSA calculation...")
        structure = pdb_parser.get_structure(pdb_file, f"{directory}/{pdb_file}")
    except Exception as e:
        print(f"Failed to parse {pdb_file}: {e}")
        return pdb_file, None, None, None

    #print(structure.id, "Chains:", [chain.id for chain in structure.get_chains()])
    ordered_stacks, peptides, split_struct = ordered_stacks_split_multi_pp(structure, expected_num_of_stacks=2, max_attempts=10)

    #print(split_struct.id, "Chains:", [chain.id for chain in split_struct.get_chains()])
    stack_chains = get_chains_from_stacks(peptides, ordered_stacks)

    if len(stack_chains) != 2:
        print(f"Expected different number of stacks, got {len(stack_chains)}, {ordered_stacks}")
        print(f"Skipping {pdb_file}")
        return pdb_file, None, None, None

    unbound_SASA_total, bound_SASA, buried_surface_area = calculate_BSA(split_struct, stack_chains[0], stack_chains[1])
    return pdb_file, buried_surface_area, bound_SASA, unbound_SASA_total



def get_BSA_from_df(fragment_df, directory="./fragment_pairs", max_workers=8):
    fragment_df = fragment_df.copy()
    pdb_files = fragment_df['PDB_File'].unique()

    updates = []

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_pdb_BSA, pdb_file, directory): pdb_file for pdb_file in pdb_files}
        for future in as_completed(futures):
            pdb_file, buried_surface_area, bound_SASA, unbound_SASA_total = future.result()
            if buried_surface_area is not None:
                updates.append({
                    'PDB_File': pdb_file,
                    'BSA': buried_surface_area,
                    'Bound_SASA': bound_SASA,
                    'Unbound_SASA': unbound_SASA_total
                })

    # Create updates DataFrame
    updates_df = pd.DataFrame(updates)
    
    # Merge in new values
    fragment_df = fragment_df.merge(updates_df, on='PDB_File', how='left')

    return fragment_df
# Get BSA for the template dataset
if os.path.exists("./results/templates_with_bsa.csv"):
    templates_with_BSA = pd.read_csv("./results/templates_with_bsa.csv")
else:
    templates_with_BSA = get_BSA_from_df(template_df, directory="./templates")
    templates_with_BSA.to_csv("./results/templates_with_bsa.csv", index=False)

templates_with_BSA.head()
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic BSA Bound_SASA Unbound_SASA
0 1yjo.pdb -27.20710 -41.6648 -70.69830 -80.8205 -7.77834 164.9760 -84.2058 1430.331422 3589.657794 5019.989216
1 1yjp_1.pdb 6.20392 -45.4826 -70.98320 -65.0946 6.56874 113.7480 -63.5063 1472.207771 3359.014188 4831.221959
2 1yjp_2.pdb -14.26580 -50.0210 -72.01630 -80.3370 2.57071 126.6720 -83.8945 1433.401072 3552.675946 4986.077018
3 2kib_1.pdb 16.43380 -44.0706 0.00000 -43.1552 -3.11863 68.1515 -59.0119 1073.027307 4801.665177 5874.692484
4 2kib_2.pdb 17.88910 -33.9384 -1.84035 -41.7314 1.71712 55.3471 -57.0966 929.267625 4687.238910 5616.506535

Some of the template structures failed the calculation (they have NaNs). This is mostly due to their unusual shape, which makes our algorithm identify the stacks (the two sides of the hexapeptide fragment pair) incorrectly.

# Mostly weird fragments which our algorithm doesn't know how to handle
# only 2y3l_3 and 5e5z should be computable
# (with a looser threshold - but it is just 2 structs)
templates_with_BSA[templates_with_BSA['BSA'].isna()]
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic BSA Bound_SASA Unbound_SASA
30 2y3l_3.pdb 38.711100 -23.52490 0.00000 -27.1616 -0.103731 42.3114 -36.6882 NaN NaN NaN
111 4uby.pdb -0.626345 -31.55220 0.00000 -48.7639 -3.787230 67.9743 -68.0499 NaN NaN NaN
118 4w5p_2.pdb -8.510380 -37.39760 -6.91971 -48.4529 -3.225380 63.8204 -66.1297 NaN NaN NaN
141 5e5v_2.pdb 7.228250 -30.64120 -5.05200 -51.7335 2.772190 68.4774 -72.0727 NaN NaN NaN
143 5e5z.pdb 23.775900 -32.34680 -19.45090 -44.6157 -1.399470 77.1989 -52.8902 NaN NaN NaN
175 6cfh_1.pdb 57.066000 -13.72370 -3.42594 -22.1422 5.805420 38.5544 -29.6550 NaN NaN NaN
176 6cfh_2.pdb 15.489300 -33.83700 0.00000 -50.5679 2.129130 66.7553 -72.3968 NaN NaN NaN
179 6cfh_5.pdb 13.991000 -22.49910 -1.86111 -39.6010 1.458860 42.7467 -58.5108 NaN NaN NaN
180 6cfh_6.pdb 55.837500 -3.76357 -7.22857 -11.8741 3.006000 21.9272 -15.7097 NaN NaN NaN

A brief summary of the templates' BSA values:

desc = templates_with_BSA['BSA'].describe()
print(desc.apply(lambda x: f"{x:.6f}"))
count     172.000000
mean     1055.313185
std       448.291994
min        -0.000000
25%       841.395469
50%      1118.878224
75%      1394.837274
max      1954.759795
Name: BSA, dtype: object
# BSA is an approximation - these values are very close to zero (and should be zero, or little above)
print(f"{len(templates_with_BSA.loc[(templates_with_BSA['BSA'] < 0)])} template pairs are below 0.")
print(f"{len(templates_with_BSA[(templates_with_BSA['BSA'] < 0) & (templates_with_BSA['BSA'] > -1)])} are approx errors. Removing them..")

# Set approximation errors (negative BSA values) to zero
templates_with_BSA.loc[(templates_with_BSA['BSA'] < 0) & (templates_with_BSA['BSA'] > -1), 'BSA'] = 0
4 template pairs are below 0.
4 are approximation errors. Setting them to zero...

Calculate BSA of our sample.

# Get BSA for the repaired sample dataset
if os.path.exists("./results/sample_with_bsa.csv"):
    sample_with_BSA = pd.read_csv("./results/sample_with_bsa.csv")
else:
    sample_with_BSA = get_BSA_from_df(sample_repaired, "./fragment_pairs")
    sample_with_BSA.to_csv("./results/sample_with_bsa.csv", index=False)

sample_with_BSA.head()
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic BSA Bound_SASA Unbound_SASA
0 7qkj_pair_765_Repair.pdb 18.45160 -30.7581 -8.22142 -42.7342 3.79509 61.2674 -53.1292 1075.330095 3265.915691 4341.245786
1 8olq_pair_012_Repair.pdb 50.32270 -36.4475 -31.34510 -59.5809 11.08980 101.5600 -67.5650 926.019646 3922.125508 4848.145153
2 8ci8_pair_225_Repair.pdb 12.05720 -36.2584 -6.14683 -67.4686 4.42839 89.9917 -93.9525 1567.003055 3808.659626 5375.662681
3 7nrq_pair_279_Repair.pdb 49.53490 -31.8512 -9.81061 -45.1977 3.33110 86.0854 -51.5352 1483.879077 3417.488690 4901.367767
4 7qkk_pair_121_Repair.pdb 8.61727 -46.0144 -19.36770 -57.6874 6.74647 89.7038 -71.9427 1072.009079 4046.350113 5118.359192
print(f"{len(sample_with_BSA[sample_with_BSA['BSA'].isna()])}/{len(sample_repaired)} have failed during the BSA calculation.")
sample_with_BSA[sample_with_BSA['BSA'].isna()]
0/12304 have failed during the BSA calculation.
PDB_File Total_Energy Backbone_HBond Sidechain_HBond Van_der_Waals Electrostatics Solvation_Polar Solvation_Hydrophobic BSA Bound_SASA Unbound_SASA

A brief summary of the sample's BSA values:

desc = sample_with_BSA['BSA'].describe()
print(desc.apply(lambda x: f"{x:.6f}"))
count    12304.000000
mean      1195.089564
std        209.751344
min        499.762813
25%       1035.606377
50%       1185.562046
75%       1335.289928
max       2297.255447
Name: BSA, dtype: object

Comparison of BSA across datasets:

plt.figure(figsize=(10,6))

# Merge
templates_with_BSA['Category'] = "Template Fragments"
sample_with_BSA['Category'] = "Sampled Fragments"
combined_data = pd.concat([templates_with_BSA, sample_with_BSA], ignore_index=True)

# Plot a comparison of Buried Surface Area
sns.kdeplot(
    data=combined_data, x="BSA",
    hue="Category",
    cut=0,
    fill=True, common_norm=False,
    palette={"Template Fragments": "mediumorchid", "Sampled Fragments": "cornflowerblue"},
    alpha=0.5
)
plt.xlabel("Burried Surface Accessibility")
plt.ylabel("Density")
plt.title(f"Burried Surface Accessibility of Extracted Fragments and Template Fragments\nTemplate n={len(templates_with_BSA)}, Sampled n={len(sample_with_BSA)}")

plt.gca().set_yticks([0, 0.001, 0.002])
#plt.xlim(-2.5, 0.5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.grid(False)

plt.savefig("plots/bsa_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/bsa_kde.pdf", bbox_inches="tight")
plt.savefig("plots/bsa_kde.svg", bbox_inches="tight")
plt.show()

Packing/Stacking Energies

We compare packing energies (the interaction energy between the two stacks of each structure) and stacking energies (the interaction energy between the first two and the last three layers in each stack) across the two datasets.

# Load results from Analyse complex for templates
ac_templates = pd.read_csv("./results/foldx_analyse_complex_results_templates.csv")
ac_templates['ID'] = ac_templates['PDB_File'].str.split('_').apply(lambda parts: '_'.join(parts[:-2]))
ac_templates['Category'] = "Templates"
ac_templates.head()
PDB_File Group1 Group2 IntraclashesGroup1 IntraclashesGroup2 Interaction_Energy BackHBond SideHBond VDWClashes ID Category
0 3q2x_b_JH_FDB.pdb HJ BDF 0.044701 0.095801 -2.01238 -6.071510 -0.789906 0.051573 3q2x_b Templates
1 2m5n_3_JHFDB_IGECA.pdb BDFHJ ACEGI 5.677250 4.744780 -22.21540 -0.450673 0.000000 2.607290 2m5n_3 Templates
2 3fr1_AC_EGI.pdb CA GIE 0.110695 0.405026 -7.39247 -6.188010 -0.867155 0.028225 3fr1 Templates
3 2ona_BD_FHJ.pdb BD FHJ 0.204879 0.496664 -3.11506 -5.760990 0.000000 0.022129 2ona Templates
4 3dgj_1_BD_FHJ.pdb DB HJF 0.913331 0.249727 -4.73258 -4.370030 -1.285720 0.060052 3dgj_1 Templates
# Load results from Analyse complex for repaired fragments
ac_extracted = pd.read_csv("./results/foldx_analyse_complex_results_fragment_pairs.csv") # FIXME
ac_extracted['ID'] = ac_extracted['PDB_File'].str[:-4]
ac_extracted['Category'] = "Extracted"
## Replace original files of standardized files
ac_extracted = filter_non_standardized(ac_extracted)
ac_extracted.head()
PDB_File Group1 Group2 IntraclashesGroup1 IntraclashesGroup2 Interaction_Energy BackHBond SideHBond VDWClashes ID Category
0 6ufr_pair_196_Repair.pdb CGFJB ADEHI 0.450845 0.054773 -21.882800 -0.410710 0.000000e+00 0.002200 6ufr_pair_196_Repair Extracted
1 7e0f_pair_124_Repair.pdb DEFGH ABCIJ 4.111020 4.127980 -14.520100 -1.207550 0.000000e+00 1.108400 7e0f_pair_124_Repair Extracted
2 6xfm_pair_135_Repair.pdb 1A 234 0.026684 0.028753 -0.407182 -4.296730 -2.064610e+00 0.000000 6xfm_pair_135_Repair Extracted
3 6xyq_pair_344_Repair.pdb HJ BDF 0.307053 0.433553 -4.612390 -4.757160 0.000000e+00 0.017628 6xyq_pair_344_Repair Extracted
4 8q9g_pair_232_Repair.pdb ACEIJ BDFGH 1.864520 0.124207 -17.891600 -0.508668 1.776360e-15 0.001564 8q9g_pair_232_Repair Extracted
# Filter based on distance (take only what's in repaired_df)
ac_extracted = ac_extracted[ac_extracted["PDB_File"].isin(set(repaired_df["PDB_File"]))]
print(f"After filtering based on length, we have ~{len(ac_extracted)//3} structures ({len(ac_extracted)} rows)")
After filtering based on length, we have ~61940 structures (185820 rows)
combined_data = pd.concat([ac_templates, ac_extracted], ignore_index=True)

combined_data['Interaction Type'] = "Unknown"
combined_data.loc[combined_data['Group1'].str.len() == combined_data['Group2'].str.len(), 'Interaction Type'] = "Packing"
combined_data.loc[combined_data['Group1'].str.len() != combined_data['Group2'].str.len(), 'Interaction Type'] = "Stacking"
combined_data.head(10)
PDB_File Group1 Group2 IntraclashesGroup1 IntraclashesGroup2 Interaction_Energy BackHBond SideHBond VDWClashes ID Category Interaction Type
0 3q2x_b_JH_FDB.pdb HJ BDF 0.044701 0.095801 -2.01238 -6.071510 -0.789906 0.051573 3q2x_b Templates Stacking
1 2m5n_3_JHFDB_IGECA.pdb BDFHJ ACEGI 5.677250 4.744780 -22.21540 -0.450673 0.000000 2.607290 2m5n_3 Templates Packing
2 3fr1_AC_EGI.pdb CA GIE 0.110695 0.405026 -7.39247 -6.188010 -0.867155 0.028225 3fr1 Templates Stacking
3 2ona_BD_FHJ.pdb BD FHJ 0.204879 0.496664 -3.11506 -5.760990 0.000000 0.022129 2ona Templates Stacking
4 3dgj_1_BD_FHJ.pdb DB HJF 0.913331 0.249727 -4.73258 -4.370030 -1.285720 0.060052 3dgj_1 Templates Stacking
5 4r0w_b_1_IG_ECA.pdb GI CAE 0.991309 0.947355 -3.19053 -6.102070 0.000000 0.008595 4r0w_b_1 Templates Stacking
6 4xfo_JH_FDB.pdb HJ BDF 0.094704 0.382677 -5.00158 -6.254170 -1.028470 0.009694 4xfo Templates Stacking
7 4ril_a_3_IG_ECA.pdb GI CAE 0.752904 0.750379 -2.34154 -5.993340 -0.522533 0.000000 4ril_a_3 Templates Stacking
8 6cfh_4_ACEGI_BDFHJ.pdb CAGEI BDHFJ 0.233039 0.345059 -28.30400 -1.715980 0.000000 0.290562 6cfh_4 Templates Packing
9 3fr1_BD_FHJ.pdb DB HJF 0.127741 0.459395 -7.71799 -5.972650 -0.597861 0.118170 3fr1 Templates Stacking
# Sanity check
print(combined_data['Group1'].str.len().value_counts())
print(combined_data['Group2'].str.len().value_counts())
Group1
2    124224
5     62112
Name: count, dtype: int64
Group2
3    124224
5     62112
Name: count, dtype: int64
plt.figure(figsize=(10,6))
K = combined_data[combined_data['Interaction Type'] == "Packing"]

# Plot a comparison of Packing Energies between Templates and Extracted
sns.kdeplot(
    data=K, x="Interaction_Energy",
    hue="Category",
    fill=True, common_norm=False,
    palette={"Templates": "mediumorchid", "Extracted": "cornflowerblue"},
    alpha=0.5
)
plt.xlabel("Packing Interaction Energy")
plt.ylabel("Density")
plt.title(f"Packing Energy Interaction Comparison\nTemplates n={len(K[K['Category']=='Templates'])}, Extracted n={len(K[K['Category']=='Extracted'])}")

plt.axvline(0, color="lightgray", linewidth=2, linestyle="--")
plt.xlim(-50, 50)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
#plt.gca().set_yticks([0, 0.1, 0.2, 0.3, 0.4])
plt.grid(False)

plt.savefig("plots/packing_energy_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/packing_energy_kde.pdf", bbox_inches="tight")
plt.savefig("plots/packing_energy_kde.svg", bbox_inches="tight")
plt.show()
plt.figure(figsize=(10,6))
K = combined_data[combined_data['Interaction Type'] == "Stacking"]

# Plot a comparison of Stacking Energies between Templates and Extracted
sns.kdeplot(
    data=K, x="Interaction_Energy",
    hue="Category",
    fill=True, common_norm=False,
    palette={"Templates": "mediumorchid", "Extracted": "cornflowerblue"},
    alpha=0.5
)
plt.xlabel("Stacking Interaction Energy")
plt.ylabel("Density")
plt.title(f"Stacking Energy Interaction Comparison\nTemplates n={len(K[K['Category']=='Templates'])}, Extracted n={len(K[K['Category']=='Extracted'])}")

plt.axvline(0, color="lightgray", linewidth=2, linestyle="--")
plt.xlim(-12.5, 12.5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.gca().set_yticks([0, 0.05, 0.1, 0.15, 0.2])
plt.grid(False)

plt.savefig("plots/stacking_energy_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/stacking_energy_kde.pdf", bbox_inches="tight")
plt.savefig("plots/stacking_energy_kde.svg", bbox_inches="tight")
plt.show()
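
A quick numeric companion to the two KDE plots above (a small sketch that only summarises the table already built):

# Summarise packing and stacking interaction energies per dataset and interaction type
summary = (
    combined_data
    .groupby(['Category', 'Interaction Type'])['Interaction_Energy']
    .describe()[['count', 'mean', 'std', '50%']]
)
print(summary)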

Let's see if there's any correlation between packing and stacking energy.

combined_data.head()
K = combined_data.copy()

packing = (
    K[K["Interaction Type"].str.lower() == "packing"]
      .loc[:, ["ID", "Interaction_Energy", "Category"]]
      .rename(columns={"Interaction_Energy": "Packing_Energy"})
)

stacking = (
    K[K["Interaction Type"].str.lower() == "stacking"]
      .loc[:, ["ID", "Interaction_Energy"]]
      .rename(columns={"Interaction_Energy": "Stacking_Energy"})
)

merged = stacking.merge(packing, on="ID", how="inner")
merged.head()
ID Stacking_Energy Packing_Energy Category
0 3q2x_b -2.01238 -31.501000 Templates
1 3fr1 -7.39247 -21.395700 Templates
2 2ona -3.11506 -15.931600 Templates
3 3dgj_1 -4.73258 -22.033400 Templates
4 4r0w_b_1 -3.19053 -0.840478 Templates
plt.figure(figsize=(10,6))
K = merged[merged['Category'] == "Templates"]

# Plot the correlation between stacking and packing energy
sns.scatterplot(
    data=K,
    x="Packing_Energy",
    y="Stacking_Energy",
    hue="Category",
    # fill=True, common_norm=False,
    palette={"Templates": "mediumorchid", "Extracted": "cornflowerblue"},
    alpha=0.5
)
plt.xlabel("Packing")
plt.ylabel("Stacking")
plt.title(f"Packing Interaction Energy vs Stacking Interaction Energy")
corr = np.corrcoef(K['Packing_Energy'], K['Stacking_Energy'])[0, 1]
print("Pearson correlation:", corr)

#plt.axvline(0, color="lightgray", linewidth=2, linestyle="--")
plt.xlim(-50, 50)
plt.ylim(-15, 15)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
#plt.gca().set_yticks([0, 0.05, 0.1, 0.15, 0.2])
plt.grid(False)

plt.savefig("plots/packing_stacking_templates_scatter.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/packing_stacking_templates_scatter.pdf", bbox_inches="tight")
plt.savefig("plots/packing_stacking_templates_scatter.svg", bbox_inches="tight")
plt.show()
Pearson correlation: 0.1850314465829147
plt.figure(figsize=(10,6))
K = merged[merged['Category'] == "Extracted"]

# Plot the correlation between stacking and packing energy
sns.scatterplot(
    data=K,
    x="Packing_Energy",
    y="Stacking_Energy",
    hue="Category",
    # fill=True, common_norm=False,
    palette={"Templates": "mediumorchid", "Extracted": "cornflowerblue"},
    s=10,
    alpha=0.1
)
plt.xlabel("Packing")
plt.ylabel("Stacking")
plt.title(f"Packing Interaction Energy vs Stacking Interaction Energy")
corr = np.corrcoef(K['Packing_Energy'], K['Stacking_Energy'])[0, 1]
print("Pearson correlation:", corr)

#plt.axvline(0, color="lightgray", linewidth=2, linestyle="--")
plt.xlim(-50, 50)
plt.ylim(-15, 15)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
#plt.gca().set_yticks([0, 0.05, 0.1, 0.15, 0.2])
plt.grid(False)

plt.savefig("plots/packing_stacking_extracted_scatter.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/packing_stacking_extracted_scatter.pdf", bbox_inches="tight")
plt.savefig("plots/packing_stacking_extracted_scatter.svg", bbox_inches="tight")
plt.show()
Pearson correlation: 0.3159500790333334

Shape diversity

To compare the shapes across both datasets, we computed two RMSD matrices and clustered them hierarchically (two-level clustering).

First level: the coordinates of each hexapeptide pair were projected onto a plane (from 3D to 2D); from these projections, an RMSD matrix was computed with a QCP superimposer. This matrix was then used as the basis for clustering with a dendrogram (Ward linkage).

Second level: the full 3D coordinates were used to build a second RMSD matrix (with a modified QCP superimposer which, after superimposing the whole structures, takes the maximum of the RMSDs computed separately over the two stacks). The clustering pipeline was: take all structures belonging to the same first-level cluster, extract the corresponding rows of the second-level RMSD matrix, and cluster them again (dendrogram with Ward linkage).
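
Both metrics can be sketched roughly as follows (a minimal sketch, assuming the CA coordinates of a pair are already available as an (N, 3) NumPy array, that the two stacks are the two halves of that array, and that the plane projection is a plain PCA fit; the alternative chain orderings handled by our projected_pairwise_rmsd_matrix helper are ignored here):

import numpy as np
from sklearn.decomposition import PCA
from Bio.PDB.qcprot import QCPSuperimposer

def project_to_plane(coords):
    # Flatten 3D CA coordinates onto their best-fit plane; keep an (N, 3) shape
    # (third column zeroed) because QCPSuperimposer expects 3D coordinates.
    centered = coords - coords.mean(axis=0)
    flat = PCA(n_components=2).fit_transform(centered)
    return np.column_stack([flat, np.zeros(len(flat))])

def projected_rmsd(coords_a, coords_b):
    # First-level metric: RMSD between the plane-projected coordinates.
    sup = QCPSuperimposer()
    sup.set(project_to_plane(coords_a), project_to_plane(coords_b))
    sup.run()
    return sup.get_rms()

def max_stack_rmsd(coords_a, coords_b):
    # Second-level metric: superimpose the whole pair, then take the worse of the
    # two per-stack RMSDs (stacks assumed to be the two halves of the array).
    sup = QCPSuperimposer()
    sup.set(coords_a, coords_b)
    sup.run()
    moved_b = sup.get_transformed()
    n = len(coords_a) // 2

    def rmsd(x, y):
        return np.sqrt(((x - y) ** 2).sum(axis=1).mean())

    return max(rmsd(coords_a[:n], moved_b[:n]), rmsd(coords_a[n:], moved_b[n:]))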

In the 2D case (first level), we assess a cluster's shape validity by viewing the superimposed projected coordinates. In the 3D case, we superimpose all structures onto their cluster center and export the cluster as a .cif file; we also plot the 2D shapes, obtained by averaging across layers and projecting onto a plane via the first two PCA components.

import warnings

warnings.filterwarnings("ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'", category=FutureWarning)
warnings.filterwarnings("ignore", message="n_jobs value .* overridden to .* by setting random_state", category=UserWarning)
warnings.filterwarnings("ignore", message="'force_all_finite' was renamed to 'ensure_all_finite'", category=FutureWarning, module=r".*sklearn.*")
warnings.filterwarnings("ignore", message=r"n_jobs value .* overridden to .* by setting random_state", category=UserWarning, module=r".*umap.*")

def print_pcoa_explained_var(pcoa_results, num_components=2):
    # Extract the raw eigenvalues, taking only the first 'num_components'
    # pcoa_results.eigvals is a pandas Series, direct slicing works
    raw_eigenvalues = pcoa_results.eigvals[:num_components]
    explained_variance_ratio = pcoa_results.proportion_explained[:num_components]
    cumulative_explained_variance = explained_variance_ratio.cumsum()

    # The metrics for the top 'num_components' components are stored in these variables:
    print("\nRaw Eigenvalues:")
    print(raw_eigenvalues)
    print("\nExplained Variance Ratio:")
    print(explained_variance_ratio)
    print("\nCumulative Explained Variance:")
    print(cumulative_explained_variance)

# Shape analysis
def plot_pca_and_umap(pca, umap, hue, palette):
    # Create wide figure with two subplots
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns
    # Get value counts
    hue_series = pd.Series(hue).astype(str)
    category_counts = hue_series.value_counts()
    hue_labels = {label: f"{label} (n={category_counts.get(label, 0)})" for label in category_counts.index}
    print(hue_labels)
    num_of_templates = category_counts.get(category_counts.index[1], 0)
    print(f"Number of templates used {num_of_templates}")
    hue_group1 = hue[:num_of_templates]
    hue_group2 = hue[num_of_templates:]

    # Plot PCA
    try:
        sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=hue, palette=palette, s=20, ax=axes[0])
    except Exception:
        print("Error, trying to plot a df")
        sns.scatterplot(data=pca[num_of_templates:], x='PCoA_1', y='PCoA_2', hue=hue_group2, palette=palette,s=20, ax=axes[0], alpha=0.3)
        sns.scatterplot(data=pca[:num_of_templates], x='PCoA_1', y='PCoA_2', hue=hue_group1, palette=palette,s=20, ax=axes[0], alpha=0.9)

    axes[0].set_title("PCoA of Reconstructed Fragments")
    axes[0].set_xlabel("PCoA 1")
    axes[0].set_ylabel("PCoA 2")
    handles, labels = axes[0].get_legend_handles_labels()
    new_labels = [hue_labels.get(label, label) for label in labels]
    axes[0].legend(handles, new_labels, title="Category")
    axes[0].spines['top'].set_visible(False)
    axes[0].spines['right'].set_visible(False)
    axes[0].spines['left'].set_color('dimgray')
    axes[0].spines['bottom'].set_color('dimgray')
    axes[0].grid(False)

    # Plot UMAP
    # Split the UMAP coordinates
    umap_group1 = umap[:num_of_templates]
    umap_group2 = umap[num_of_templates:]

    # Plot Group 2 (overrepresented) first
    sns.scatterplot(x=umap_group2[:, 0], y=umap_group2[:, 1], hue=hue_group2,
                    palette=palette, s=20, ax=axes[1], alpha=0.3)
    sns.scatterplot(x=umap_group1[:, 0], y=umap_group1[:, 1], hue=hue_group1,
                    palette=palette, s=20, ax=axes[1], alpha=0.9)

    # Adjust labels and legend
    axes[1].set_title("UMAP of Reconstructed Fragments")
    axes[1].set_xlabel("UMAP 1")
    axes[1].set_ylabel("UMAP 2")
    handles, labels = axes[1].get_legend_handles_labels()
    new_labels = [hue_labels.get(label, label) for label in labels]
    axes[1].legend(handles, new_labels, title="Category")
    axes[1].spines['top'].set_visible(False)
    axes[1].spines['right'].set_visible(False)
    axes[1].spines['left'].set_color('dimgray')
    axes[1].spines['bottom'].set_color('dimgray')
    axes[1].grid(False)
    # Adjust layout
    plt.tight_layout()
    plt.show()

def num_components_explaining(percent_of_variance, pcoa_results):
    explained_variance_sum = 0
    components_explaining = 0

    for i in range(0, pcoa_results.samples.shape[1]):
        explained_variance_sum += pcoa_results.proportion_explained.iloc[i]
        if explained_variance_sum >= percent_of_variance / 100:
            components_explaining = i + 1
            print(f"{percent_of_variance}% of variance is explained by the first {i+1} components")
            return components_explaining

def plot_umap(num_templates, pdb_files, umap_emb, save_path="plots/rmsd_umap"):
    umapca_df = pd.DataFrame({
    'PDB_File': pdb_files,
    'UMAP_1': umap_emb[:, 0],
    'UMAP_2': umap_emb[:, 1]
    })

    fig = plt.figure(figsize=(10, 6))
    sampled_K = umapca_df.iloc[num_templates:]
    template_K = umapca_df.iloc[:num_templates]
    n_sampled =  len(sampled_K) - sampled_K['UMAP_1'].isnull().sum()
    n_template = len(template_K) - template_K['UMAP_1'].isnull().sum()

    sns.scatterplot(data=sampled_K, x='UMAP_1', y='UMAP_2',
                    color='cornflowerblue', s=20, alpha=0.3, label=f'Sampled Fragments (n={n_sampled})')
    sns.scatterplot(data=template_K, x='UMAP_1', y='UMAP_2',
                    color='mediumorchid', s=20, alpha=0.9, label=f'Template Fragments (n={n_template})')

    plt.title('UMAP Projection of All-to-All RMSD Matrix')
    plt.xlabel('UMAP 1')
    plt.ylabel('UMAP 2')
    plt.legend(title='Fragment Type')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['left'].set_color('dimgray')
    plt.gca().spines['bottom'].set_color('dimgray')
    plt.grid(False)

    # plt.xlim(-0,5, 5.5)
    # plt.gca().set_yticks([0, 0.3, 0.6])

    plt.tight_layout()

    plt.savefig(f"{save_path}.png", dpi=600, bbox_inches="tight")
    plt.savefig(f"{save_path}.pdf", bbox_inches="tight")
    plt.savefig(f"{save_path}.svg", bbox_inches="tight")
    plt.show()

def plot_dendrogram(Z, labels, num_clusters, t_distance, save_path="plots/rmsd_dendrogram"):
    fig = plt.figure(figsize=(20, 8))
    ax = fig.add_subplot(111)

    sch.dendrogram(
        Z,
        labels=labels,
        leaf_rotation=90,
        leaf_font_size=8,
        #truncate_mode='lastp',
        p=num_clusters,
        show_contracted=True,
        color_threshold=t_distance,
        above_threshold_color='black',
        ax=ax
    )
    ax.grid(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.gca().spines['left'].set_color('dimgray')
    ax.spines['bottom'].set_visible(False)
    ax.set_xticklabels([]) # [str(label) for label in range(1, num_clusters + 1)]

    plt.title('Hierarchical Clustering Dendrogram with Clusters Highlighted')
    plt.xlabel('')
    plt.ylabel('Distance (Ward)')
    plt.tight_layout() # Adjust layout to prevent labels from overlapping

    plt.savefig(f"{save_path}.png", dpi=600, bbox_inches="tight")
    plt.savefig(f"{save_path}.pdf", bbox_inches="tight")
    plt.savefig(f"{save_path}.svg", bbox_inches="tight")
    plt.show()
def load_features(path):
    # Load precomputed features
    data = np.load(path, allow_pickle=True)
    return data["X"], data["files"]

def load_matrix(path):
    print("Loading an RMSD matrix...")
    X, pdb_files = load_features(path)
    print(f"Loaded. Matrix has {X.shape} shape.")
    print(f"With corresponding number of files {len(pdb_files)}")

    # Get the NaN values counts
    nan_count_per_row = np.isnan(X).sum(axis=1)
    nan_count_counts = pd.Series(nan_count_per_row).value_counts()
    print(nan_count_counts)

    # Find indices of files which failed
    nan_count_to_remove = max(nan_count_per_row)
    if nan_count_to_remove > 0:
        indices_to_remove = [idx for idx, count in enumerate(nan_count_per_row) if count == nan_count_to_remove]
        print(f"Files to remove due to high NaN count ({nan_count_to_remove} nans)")
    else:
        indices_to_remove = []

    # Remove these rows and columns
    print(f"Removing {len(indices_to_remove)} files with {nan_count_to_remove} NaN values in their respective row.")
    keep_indices = [i for i in range(len(pdb_files)) if i not in indices_to_remove]
    X = X[np.ix_(keep_indices, keep_indices)]
    print(f"Resulting matrix shape is {X.shape}")

    # Update the pdb_files list
    pdb_files = [pdb_files[i] for i in range(len(pdb_files)) if i not in indices_to_remove]
    print(f"Adjusted number of files {len(pdb_files)}")

    return X, pdb_files

def reduce_matrix(X, files, max_samples=5_000, float_32=True):
    # Get a subset so we can handle it computationally
    if X.shape[0] > max_samples:
        print(f"Reducing matrix size to the first {max_samples} samples")
        selected = np.arange(max_samples)
        X = X[np.ix_(selected, selected)]
        files = files[:max_samples]

    if float_32:
        print("Converting to float32")
        X = X.astype(np.float32) # and normalize if necessary

    return X, files

def filter_matrix(X, files, wanted_files):
    filter_indices = []
    not_found_files = []
    for filtered_file in wanted_files:
        try:
            filter_indices.append(files.index(filtered_file))
        except ValueError:
            not_found_files.append(filtered_file)
    print(f"{len(not_found_files)} were not found in the pdb_files_projected matrix")
    print(np.array(filter_indices).shape)

    X = X[np.ix_(filter_indices, filter_indices)]
    files = np.asarray(files)[filter_indices].tolist()

    return X, files
# Load and reduce projected coordinates "2D-RMSD" matrix
rmsd_X_projected, pdb_files_projected = load_matrix("rmsd_matrix_projected.npz")
# Filter on our new condition
helper_templates = pd.DataFrame([{"PDB_File": file} for file in pdb_files_projected[:170]])
temporary = pd.concat([helper_templates, filter_helper])
temporary = temporary.reset_index(drop=True)
temporary
#rmsd_X_projected, pdb_files_projected = reduce_matrix(rmsd_X_projected, pdb_files_projected)
rmsd_X_projected, pdb_files_projected = filter_matrix(rmsd_X_projected, pdb_files_projected, temporary["PDB_File"].tolist())
Loading an RMSD matrix...
Loaded. Matrix has (35969, 35969) shape.
With corresponding number of files 35969
0    35969
Name: count, dtype: int64
Removing 0 files with 0 NaN values in their respective row.
Resulting matrix shape is (35969, 35969)
Adjusted number of files 35969
16 requested files were not found in the matrix's file list
(12458,)
plt.figure(figsize=(5,3))

sns.kdeplot(
    rmsd_X_projected[np.triu_indices_from(rmsd_X_projected, k=1)].flatten(),
    color = "midnightblue",
    linewidth=2,
)

plt.xlim(-0.5, 5.5)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.gca().set_yticks([0, 0.3, 0.6])
plt.gca().set_xticks([0, 1, 2, 3, 4, 5])
plt.grid(False)

plt.savefig("plots/rmsd_projected_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/rmsd_projected_kde.pdf", bbox_inches="tight")
plt.savefig("plots/rmsd_projected_kde.svg", bbox_inches="tight")
plt.show()
# Load and reduce "3D-RMSD" matrix with max_stack superimposer
rmsd_X_max, pdb_files_max = load_matrix("rmsd_matrix_max_stack.npz")
# rmsd_X_max, pdb_files_max = reduce_matrix(rmsd_X_max, pdb_files_max)
pdb_files_max = [file[17:] for file in pdb_files_max]  # strip the 17-character directory prefix from the stored file names

rmsd_X_max, pdb_files_max = filter_matrix(rmsd_X_max, pdb_files_max, temporary["PDB_File"].tolist())
Loading an RMSD matrix...
Loaded. Matrix has (36047, 36047) shape.
With corresponding number of files 36047
78       35969
36046       78
Name: count, dtype: int64
Files to remove due to high NaN count (36046 nans)
Removing 78 files with 36046 NaN values in their respective row.
Resulting matrix shape is (35969, 35969)
Adjusted number of files 35969
16 requested files were not found in the matrix's file list
(12458,)
plt.figure(figsize=(5,3))

sns.kdeplot(
    rmsd_X_max[np.triu_indices_from(rmsd_X_max, k=1)].flatten(),
    color = "midnightblue",
    linewidth=2,
)

plt.xlim(-0.5, 16)
plt.ylim(0, 0.4)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_color('dimgray')
plt.gca().set_yticks([0, 0.2, 0.4])
plt.gca().set_xticks([0, 5, 10, 15])
plt.grid(False)

plt.savefig("plots/rmsd_max_kde.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/rmsd_max_kde.pdf", bbox_inches="tight")
plt.savefig("plots/rmsd_max_kde.svg", bbox_inches="tight")
plt.show()

First level

distance_matrix = DistanceMatrix(rmsd_X_projected)

# Perform PCoA
num_top_components = 2
pcoa_results = pcoa(distance_matrix, number_of_dimensions=400)
results_df = pcoa_results.samples.iloc[:, :num_top_components].copy()
## Create column names for the DataFrame based on the number of top components
column_names = [f'PCoA_{i+1}' for i in range(num_top_components)]
results_df.columns = column_names
print_pcoa_explained_var(pcoa_results, num_top_components)

# UMAP
## Initialize UMAP with 'precomputed' metric
umap_model = umap.UMAP(n_neighbors=30, n_components=2, metric='precomputed', random_state=42, min_dist=0.05)
emb = umap_model.fit_transform(rmsd_X_projected)

num_templates_projected=170
category = np.array(
    ["Templates"] * num_templates_projected + ["Extracted"] * (len(pdb_files_projected) - num_templates_projected)
)

plot_pca_and_umap(results_df, emb, category, {"Templates": "mediumorchid", "Extracted": "cornflowerblue"})
Raw Eigenvalues:
PC1    13451.882343
PC2     6343.336759
dtype: float64

Explained Variance Ratio:
PC1    0.235486
PC2    0.111045
dtype: float64

Cumulative Explained Variance:
PC1    0.235486
PC2    0.346531
dtype: float64
{'Extracted': 'Extracted (n=12288)', 'Templates': 'Templates (n=170)'}
Number of templates used 170
Error, trying to plot a df
components_explaining = num_components_explaining(percent_of_variance=90, pcoa_results=pcoa_results)
umapca_model = umap.UMAP(n_neighbors=30, n_components=2, random_state=42, min_dist=0.1, spread=1)
umapca_emb = umapca_model.fit_transform(pcoa_results.samples.iloc[:, :components_explaining].values)

plot_umap(num_templates_projected, pdb_files_projected, umapca_emb, save_path="plots/rmsd_projected_umap")
90% of variance is explained by the first 266 components
# Perform hierarchical clustering
t_distance = 25
Z = sch.linkage(squareform(rmsd_X_projected), method='ward', metric='precomputed')

labels = sch.fcluster(Z, t=t_distance, criterion='distance')

num_clusters = len(np.unique(labels))
print(f"Found {num_clusters} number of clusters... labeled {len(labels)} structures")

custom_cluster_palette = [ # set tab10 cluster colors to match the dendrogram
'tab:orange', 'tab:green', 'tab:red',
'tab:purple', 'tab:brown', 'tab:pink',
'tab:gray', 'tab:olive', 'tab:cyan'#, 'tab:blue'
]
plot_dendrogram(Z, pdb_files_projected, num_clusters, t_distance, save_path="plots/rmsd_projected_dendrogram")
Found 21 clusters... labeled 12458 structures
import superpose_fragments
import projected_pairwise_rmsd_matrix
from Bio.PDB.qcprot import QCPSuperimposer

pdb_to_idx = {file: idx for idx, file in enumerate(pdb_files_projected)}

def calculate_summed_distance_within_cluster(row):
    current_pdb = row['PDB_File']
    current_cluster = row['Cluster']

    # Get all PDB files in the current cluster
    cluster_members_df = K[K['Cluster'] == current_cluster]
    cluster_member_pdbs = cluster_members_df['PDB_File'].tolist()

    # Get the index of the current and other cluster members' PDB files in the RMSD matrix
    current_pdb_idx = pdb_to_idx[current_pdb]
    cluster_member_indices = [pdb_to_idx[pdb] for pdb in cluster_member_pdbs]

    # Extract the relevant row from the RMSD matrix (distances from current PDB)
    distances_from_current_pdb = rmsd_X_projected[current_pdb_idx, :] # FIXME
    summed_distance = distances_from_current_pdb[cluster_member_indices].sum()
    # summed_distance_squared = (distances_from_current_pdb[cluster_member_indices]**2).sum() # TODO
    # if i want to know which files distort the cluster
    return summed_distance

def superpose_cluster(cluster_num, distance_matrix, projected_alignment=False, K_labeled=K):
    # Get the indices of the PDB files in the target cluster
    cluster_rows = K_labeled[K_labeled['Cluster'] == cluster_num].sort_values(by='Summed_Distances_Within_Cluster', ascending=True)
    cluster_indices = cluster_rows['PDB_File'].apply(lambda x: pdb_to_idx[x]).tolist()
    cluster_rmsds_submatrix = distance_matrix[np.ix_(cluster_indices, cluster_indices)]
    # Flatten the upper triangular part
    upper_triangle_indices = np.triu_indices_from(cluster_rmsds_submatrix, k=1) # FIXME: can be rectangular not a square
    all_pairwise_cluster_distances = cluster_rmsds_submatrix[upper_triangle_indices]
    # Get all the cluster paths and superpose
    paths = []
    for file in list(cluster_rows['PDB_File'].values):
        file = file.split('/')[-1]
        if file.split('_')[-1] == "Repair.pdb":
            paths.append(f"./fragment_pairs/{file}")
        else:
            paths.append(f"./templates/{file}")
    if not projected_alignment:
        rmsds, super_opts, aligned_coords = superpose_fragments.superpose_all(paths, output_path=f"superposed_cluster_{cluster_num}.cif")
    else:
        center_projected_coords, _ = projected_pairwise_rmsd_matrix.get_projected_coords_and_std(paths[0])
        superimposer = QCPSuperimposer()
        aligned_coords = [center_projected_coords]
        rmsds = []

        for path in paths[1:]:
            projected_coords, _ = projected_pairwise_rmsd_matrix.get_projected_coords_and_std(path)
            if projected_coords is None: # FIXME: why some templates failing
                print(f"{projected_coords} of {path}")
                continue
            
            possible_aligns = projected_pairwise_rmsd_matrix.generate_possible_alignments_test(projected_coords)

            best_rmsd = float('inf')
            best_variant = None
            for align in possible_aligns:
                superimposer.set(center_projected_coords, align)
                superimposer.run()
                rmsd = superimposer.get_rms()
                
                if rmsd < best_rmsd:
                    best_rmsd = rmsd
                    best_variant = superimposer.get_transformed()

            rmsds.append(best_rmsd)
            aligned_coords.append(best_variant)

    print(f"Number of structures in cluster {cluster_num}: {len(cluster_rows)}")
    print(f"Mean RMSD within cluster: {np.mean(all_pairwise_cluster_distances):.2f} Å")
    print(f"Standard deviation of RMSDs within cluster: {np.std(all_pairwise_cluster_distances):.2f} Å")
    print(f"Standard deviation of RMSDs superposition to centroid structure: {np.std(rmsds):.2f} Å")
    return aligned_coords

def plot_projected_cluster(cluster_num, projected_alignment=True, X=rmsd_X_projected, K_labeled=K, save_path="plots/cluster_test"):
    aligned_coords = superpose_cluster(cluster_num, X, projected_alignment=projected_alignment, K_labeled=K_labeled)
    aligned_coords = np.array(aligned_coords)
    avg_layer_coords = aligned_coords.reshape(-1, 12, 3)
    mean_coords = np.mean(avg_layer_coords, axis=0)
    std_coords = np.std(avg_layer_coords, axis=0)

    centered_mean_coords = mean_coords - np.mean(mean_coords, axis=0)
    pca = PCA(n_components=2)
    pca.fit(centered_mean_coords)
    projected_coords = pca.transform(centered_mean_coords)

    print(f"Total variance explained by best-fit plane: ~{np.sum(pca.explained_variance_ratio_):.2f}")
    print(f"Variance explained by PC1: {pca.explained_variance_ratio_[0]:.2f}")
    print(f"Variance explained by PC2: {pca.explained_variance_ratio_[1]:.2f}")

    projected_stdev_x = np.dot(std_coords, pca.components_[0]) # Project stdev along PC1
    projected_stdev_y = np.dot(std_coords, pca.components_[1]) # Project stdev along PC2
    projected_stdev_x_abs = np.abs(projected_stdev_x)
    projected_stdev_y_abs = np.abs(projected_stdev_y)

    fig_2d_pca = plt.figure(figsize=(6, 4))
    ax_2d_pca = fig_2d_pca.add_subplot(111)

    # Plot the mean trace on the best-fit plane
    ax_2d_pca.plot(projected_coords[:6, 0], 
                    projected_coords[:6, 1], 
                    'o-', label='Mean Trace (Hexapeptide 1)', color='black', linewidth=10, markersize=6)
    ax_2d_pca.plot(projected_coords[6:, 0], 
                    projected_coords[6:, 1], 
                    'o-', label='Mean Trace (Hexapeptide 2)', color='black', linewidth=10, markersize=6)
    # Add "blurriness" as error bars
    # Use projected_stdev_x_abs and projected_stdev_y_abs for error bars along the new axes
    ax_2d_pca.errorbar(projected_coords[:, 0], projected_coords[:, 1],
                       xerr=projected_stdev_x_abs, yerr=projected_stdev_y_abs,
                       fmt='none', capsize=3, color='grey', alpha=0.5, zorder=0)

    ax_2d_pca.axes.get_xaxis().set_visible(False)
    ax_2d_pca.axes.get_yaxis().set_visible(False)
    # remove the box
    for spine in ax_2d_pca.spines.values():
        spine.set_visible(False)
    ax_2d_pca.grid(False)
    ax_2d_pca.set_aspect('equal', adjustable='box') # Keep aspect ratio for true shape
    plt.tight_layout()

    plt.savefig(f"{save_path}_{cluster_num}.png", dpi=300, bbox_inches="tight")
    plt.savefig(f"{save_path}_{cluster_num}.pdf", bbox_inches="tight")
    plt.savefig(f"{save_path}_{cluster_num}.svg", bbox_inches="tight")
    plt.show()

K = pd.DataFrame({
    'PDB_File': pdb_files_projected,
    'Cluster': labels,
    'Category': category
})

# Apply the function to each row
K['Summed_Distances_Within_Cluster'] = K.apply(calculate_summed_distance_within_cluster, axis=1)
cluster_num = 11
# superpose_cluster(cluster_num, rmsd_X_projected)
plot_projected_cluster(cluster_num, True, rmsd_X_projected, K)
Number of structures in cluster 11: 309
Mean RMSD within cluster: 2.67 Å
Standard deviation of RMSDs within cluster: 0.59 Å
Standard deviation of RMSDs superposition to centroid structure: 0.61 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.84
Variance explained by PC2: 0.16
for i in range(max(labels)):
    plot_projected_cluster(i+1, True, rmsd_X_projected, K, save_path="plots/projected_clusters/projected_cluster")
Number of structures in cluster 1: 370
Mean RMSD within cluster: 1.96 Å
Standard deviation of RMSDs within cluster: 0.63 Å
Standard deviation of RMSDs superposition to centroid structure: 0.56 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.98
Variance explained by PC2: 0.02
Number of structures in cluster 2: 612
Mean RMSD within cluster: 1.66 Å
Standard deviation of RMSDs within cluster: 0.46 Å
Standard deviation of RMSDs superposition to centroid structure: 0.49 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.95
Variance explained by PC2: 0.05
Number of structures in cluster 3: 433
Mean RMSD within cluster: 1.89 Å
Standard deviation of RMSDs within cluster: 0.48 Å
Standard deviation of RMSDs superposition to centroid structure: 0.46 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.88
Variance explained by PC2: 0.12
Number of structures in cluster 4: 591
Mean RMSD within cluster: 2.28 Å
Standard deviation of RMSDs within cluster: 0.53 Å
Standard deviation of RMSDs superposition to centroid structure: 0.52 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.92
Variance explained by PC2: 0.08
Number of structures in cluster 5: 360
Mean RMSD within cluster: 1.70 Å
Standard deviation of RMSDs within cluster: 0.48 Å
Standard deviation of RMSDs superposition to centroid structure: 0.47 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.80
Variance explained by PC2: 0.20
Number of structures in cluster 6: 868
Mean RMSD within cluster: 2.27 Å
Standard deviation of RMSDs within cluster: 0.51 Å
Standard deviation of RMSDs superposition to centroid structure: 0.42 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.78
Variance explained by PC2: 0.22
Number of structures in cluster 7: 469
Mean RMSD within cluster: 1.31 Å
Standard deviation of RMSDs within cluster: 0.46 Å
Standard deviation of RMSDs superposition to centroid structure: 0.36 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.79
Variance explained by PC2: 0.21
Number of structures in cluster 8: 468
Mean RMSD within cluster: 1.51 Å
Standard deviation of RMSDs within cluster: 0.47 Å
Standard deviation of RMSDs superposition to centroid structure: 0.47 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.93
Variance explained by PC2: 0.07
Number of structures in cluster 9: 500
Mean RMSD within cluster: 1.49 Å
Standard deviation of RMSDs within cluster: 0.44 Å
Standard deviation of RMSDs superposition to centroid structure: 0.36 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.82
Variance explained by PC2: 0.18
Number of structures in cluster 10: 776
Mean RMSD within cluster: 2.21 Å
Standard deviation of RMSDs within cluster: 0.50 Å
Standard deviation of RMSDs superposition to centroid structure: 0.57 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.82
Variance explained by PC2: 0.18
Number of structures in cluster 11: 309
Mean RMSD within cluster: 2.67 Å
Standard deviation of RMSDs within cluster: 0.59 Å
Standard deviation of RMSDs superposition to centroid structure: 0.61 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.84
Variance explained by PC2: 0.16
Number of structures in cluster 12: 453
Mean RMSD within cluster: 1.90 Å
Standard deviation of RMSDs within cluster: 0.55 Å
Standard deviation of RMSDs superposition to centroid structure: 0.49 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.65
Variance explained by PC2: 0.35
Number of structures in cluster 13: 1168
Mean RMSD within cluster: 2.18 Å
Standard deviation of RMSDs within cluster: 0.45 Å
Standard deviation of RMSDs superposition to centroid structure: 0.47 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.55
Variance explained by PC2: 0.45
Number of structures in cluster 14: 408
Mean RMSD within cluster: 1.96 Å
Standard deviation of RMSDs within cluster: 0.53 Å
Standard deviation of RMSDs superposition to centroid structure: 0.49 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.60
Variance explained by PC2: 0.40
Number of structures in cluster 15: 611
Mean RMSD within cluster: 1.91 Å
Standard deviation of RMSDs within cluster: 0.53 Å
Standard deviation of RMSDs superposition to centroid structure: 0.54 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.63
Variance explained by PC2: 0.37
Number of structures in cluster 16: 366
Mean RMSD within cluster: 1.15 Å
Standard deviation of RMSDs within cluster: 0.36 Å
Standard deviation of RMSDs superposition to centroid structure: 0.31 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.71
Variance explained by PC2: 0.29
Expected 60 atoms, got 0 atoms in ./fragment_pairs/7ncj_pair_009_Repair.pdb
None of ./fragment_pairs/7ncj_pair_009_Repair.pdb
Number of structures in cluster 17: 857
Mean RMSD within cluster: 1.58 Å
Standard deviation of RMSDs within cluster: 0.50 Å
Standard deviation of RMSDs superposition to centroid structure: 0.50 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.79
Variance explained by PC2: 0.21
Number of structures in cluster 18: 581
Mean RMSD within cluster: 1.46 Å
Standard deviation of RMSDs within cluster: 0.52 Å
Standard deviation of RMSDs superposition to centroid structure: 0.53 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.73
Variance explained by PC2: 0.27
Number of structures in cluster 19: 451
Mean RMSD within cluster: 1.58 Å
Standard deviation of RMSDs within cluster: 0.51 Å
Standard deviation of RMSDs superposition to centroid structure: 0.48 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.68
Variance explained by PC2: 0.32
Number of structures in cluster 20: 875
Mean RMSD within cluster: 1.39 Å
Standard deviation of RMSDs within cluster: 0.39 Å
Standard deviation of RMSDs superposition to centroid structure: 0.26 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.57
Variance explained by PC2: 0.43
Number of structures in cluster 21: 932
Mean RMSD within cluster: 1.76 Å
Standard deviation of RMSDs within cluster: 0.60 Å
Standard deviation of RMSDs superposition to centroid structure: 0.63 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.60
Variance explained by PC2: 0.40
def plot_labeled_umap(df):
    fig = plt.figure(figsize=(10, 6))

    sns.scatterplot(data=df, x='UMAP_1', y='UMAP_2',
                    hue='Cluster', s=20, alpha=1, palette=custom_cluster_palette)

    plt.title('UMAP Projection of Sampled and Template Fragments')
    plt.xlabel('UMAP 1')
    plt.ylabel('UMAP 2')
    plt.legend(title='Clusters')
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['right'].set_visible(False)
    plt.grid(False)
    #plt.ylim(-0.5, 7)
    plt.gca().spines['left'].set_color('dimgray')
    plt.gca().spines['bottom'].set_color('dimgray')
    #plt.gca().set_yticks([0, 2, 4, 6])

    plt.tight_layout()
    plt.savefig("plots/rmsd_projected_labeled_umap.png", dpi=600, bbox_inches="tight")
    plt.savefig("plots/rmsd_projected_labeled_umap.pdf", bbox_inches="tight")
    plt.savefig("plots/rmsd_projected_labeled_umap.svg", bbox_inches="tight")
    plt.show()

umapca_df = pd.DataFrame({
'PDB_File': pdb_files_projected,
'UMAP_1': umapca_emb[:, 0],
'UMAP_2': umapca_emb[:, 1]
})
K_merged = pd.merge(K, umapca_df, on="PDB_File")
plot_labeled_umap(K_merged)
unique_cluster_names = K['Cluster'].unique()
cluster_order_numeric_sorted_str = sorted(unique_cluster_names, key=int)
K['Cluster'] = pd.Categorical(K['Cluster'], categories=cluster_order_numeric_sorted_str, ordered=True)

# Calculate value counts based on the ordered categorical 'Cluster'
# This will now respect the order defined in the categorical type
value_counts_ordered = K['Cluster'].value_counts().sort_index()
#print([f"Cluster {i+1}, {val}" for i, val in enumerate(value_counts_ordered.values)])
# Prepare data for "Template Fragments"
stacked_counts = K.groupby(['Cluster', 'Category']).size().reset_index(name='counts')
stacked_counts = stacked_counts[stacked_counts['Category'] == "Templates"]
#print([f"Cluster {i+1}, {val}" for i, val in enumerate(stacked_counts['counts'].values)])

plt.figure(figsize=(25, 2))
# Plot the total cluster sizes, now ordered by cluster number
ax = sns.barplot(x=value_counts_ordered.index, y=value_counts_ordered.values, color='midnightblue')
# Plot the "Template Fragments" counts, which will also be ordered by cluster number
#ax = sns.barplot(x='Cluster', y='counts', color='mediumorchid', data=stacked_counts) # TODO: purple distribution (kde)
tmp = stacked_counts.set_index('Cluster')['counts']
x_template = np.repeat([i-1 for i in tmp.index.values], tmp.values)
x_template_num = pd.Series(x_template).astype("int64").to_numpy(dtype=float)

# Match x-limits to the clusters
xmin, xmax = value_counts_ordered.index.min(), value_counts_ordered.index.max()
ax.set_xlim(-1, xmax)

# --- KDE on a secondary y-axis ---
ax2 = ax.twinx()
sns.kdeplot(
    x=x_template_num, ax=ax2, fill=True, alpha=0.3,
    bw_adjust=0.1, color='mediumorchid', clip=(xmin - 2, xmax + 2)
)

plt.xlabel('Cluster')
plt.ylabel('Count')
ax.grid(False)
ax2.grid(False)
# remove spines
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_color('dimgray')
ax2.spines['left'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax2.spines['bottom'].set_visible(False)
ax2.spines['right'].set_color('dimgray')
ax2.set_ylabel('Density')
plt.xticks(rotation=90)
plt.tight_layout()

plt.savefig("plots/cluster_counts_projected_bar.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/cluster_counts_projected_bar.pdf", bbox_inches="tight")
plt.savefig("plots/cluster_counts_projected_bar.svg", bbox_inches="tight")
plt.show()
pdb_to_idx = {file: idx for idx, file in enumerate(pdb_files_projected)}
rmsd_plotting_df = []

for cluster in cluster_order_numeric_sorted_str:
    cluster_rows = K[K['Cluster'] == cluster]
    cluster_indices = cluster_rows['PDB_File'].apply(lambda x: pdb_to_idx[x]).tolist()
    cluster_rmsds_submatrix = rmsd_X_projected[np.ix_(cluster_indices, cluster_indices)]
    # Flatten the upper triangular part
    upper_triangle_indices = np.triu_indices_from(cluster_rmsds_submatrix, k=1)
    all_pairwise_cluster_distances = cluster_rmsds_submatrix[upper_triangle_indices]

    rmsds_test=[]
    for dist in all_pairwise_cluster_distances:
        if dist == 0:
            continue
        rmsds_test.append(dist)
        rmsd_plotting_df.append({'Cluster': cluster, 'RMSD': dist})

rmsd_plotting_df = pd.DataFrame(rmsd_plotting_df)

plt.figure(figsize=(10, 6))
sns.violinplot(
    x='Cluster',
    y='RMSD',
    data=rmsd_plotting_df,
    palette=custom_cluster_palette,
    inner='quartile',
    width=0.7,
    cut=0,
)

plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Pairwise RMSD', fontsize=12)
plt.title('Distribution of Pairwise RMSDs within Clusters', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

plt.ylim(-0.5, 7)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_color('dimgray')
plt.gca().spines['bottom'].set_visible(False)
plt.gca().set_yticks([0, 2, 4, 6])

plt.grid(axis='y', linestyle='--', alpha=0.7) #, linewidth=2)
plt.tight_layout()

plt.savefig("plots/rmsd_projected_within_cluster_violin.png", dpi=600, bbox_inches="tight")
plt.savefig("plots/rmsd_projected_within_cluster_violin.pdf", bbox_inches="tight")
plt.savefig("plots/rmsd_projected_within_cluster_violin.svg", bbox_inches="tight")
plt.show()

Second level

distance_matrix = DistanceMatrix(rmsd_X_max) # FIXME: MAX_STACK SUPERIMPOSER DOESNT GIVE US A `DISTANCE` MATRIX

# Perform PCoA
num_top_components = 2
pcoa_results = pcoa(distance_matrix, number_of_dimensions=3000)
results_df = pcoa_results.samples.iloc[:, :num_top_components].copy()
## Create column names for the DataFrame based on the number of top components
column_names = [f'PCoA_{i+1}' for i in range(num_top_components)]
results_df.columns = column_names
print_pcoa_explained_var(pcoa_results, num_top_components)


# UMAP
## Initialize UMAP with 'precomputed' metric
umap_model = umap.UMAP(n_neighbors=30, n_components=2, metric='precomputed', random_state=42)
emb = umap_model.fit_transform(rmsd_X_max)

num_templates_max = 170
category_spatial = np.array(
    ["Templates"] * num_templates_max + ["Extracted"] * (len(pdb_files_max) - num_templates_max)
)

plot_pca_and_umap(results_df, emb, category_spatial, {"Templates": "mediumorchid", "Extracted": "cornflowerblue"})
Raw Eigenvalues:
PC1    34424.583235
PC2    20307.085038
dtype: float64

Explained Variance Ratio:
PC1    0.049221
PC2    0.029035
dtype: float64

Cumulative Explained Variance:
PC1    0.049221
PC2    0.078256
dtype: float64
{'Extracted': 'Extracted (n=12288)', 'Templates': 'Templates (n=170)'}
Number of templates used 170
Error, trying to plot a df
components_explaining = num_components_explaining(percent_of_variance=90, pcoa_results=pcoa_results)
umapca_model = umap.UMAP(n_neighbors=15, n_components=2, random_state=42, min_dist=0.1)
umapca_emb = umapca_model.fit_transform(pcoa_results.samples.iloc[:, :components_explaining].values)

plot_umap(num_templates_max, pdb_files_max, umapca_emb, save_path="plots/rmsd_max_umap")
90% of variance is explained by the first 2765 components
pdb_to_idx = {file: idx for idx, file in enumerate(pdb_files_max)}

def calculate_summed_distance_within_cluster_spatial(row):
    current_pdb = row['PDB_File']
    current_cluster = row['Cluster']

    # Get all PDB files in the current cluster
    cluster_members_df = K_spatial[K_spatial['Cluster'] == current_cluster]
    cluster_member_pdbs = cluster_members_df['PDB_File'].tolist()

    # Get the index of the current and other cluster members' PDB files in the RMSD matrix
    current_pdb_idx = pdb_to_idx[current_pdb]
    cluster_member_indices = [pdb_to_idx[pdb] for pdb in cluster_member_pdbs]

    # Extract the relevant row from the RMSD matrix (distances from current PDB)
    distances_from_current_pdb = rmsd_X_projected[current_pdb_idx, :] # FIXME
    summed_distance = distances_from_current_pdb[cluster_member_indices].sum()
    # summed_distance_squared = (distances_from_current_pdb[cluster_member_indices]**2).sum() # TODO
    # if i want to know which files distort the cluster
    return summed_distance

# Perform hierarchical clustering
t_distance_spatial = 25
Z_spatial = sch.linkage(squareform(rmsd_X_max), method='ward', metric='precomputed')

labels_spatial = sch.fcluster(Z_spatial, t=t_distance_spatial, criterion='distance')

num_clusters_spatial = len(np.unique(labels_spatial))
print(f"Found {num_clusters_spatial} number of clusters... labeled {len(labels_spatial)} structures")

K_spatial = pd.DataFrame({
    'PDB_File': pdb_files_max,
    'Cluster': labels_spatial,
    'Category': category_spatial
})
# Apply the function to each row
K_spatial['Summed_Distances_Within_Cluster'] = K_spatial.apply(calculate_summed_distance_within_cluster_spatial, axis=1)

plot_dendrogram(Z_spatial, pdb_files_max, num_clusters_spatial, t_distance_spatial, save_path="plots/rmsd_max_dendrogram")
Found 38 clusters... labeled 12458 structures
def calculate_summed_distance_within_cluster_spatial_sub(row):
    current_pdb = row['PDB_File']
    current_cluster = row['Cluster']

    # Get all PDB files in the current cluster
    cluster_members_df = K_hier[K_hier['Cluster'] == current_cluster]
    cluster_member_pdbs = cluster_members_df['PDB_File'].tolist()

    # Get the index of the current and other cluster members' PDB files in the RMSD matrix
    current_pdb_idx = pdb_to_idx[current_pdb]
    cluster_member_indices = [pdb_to_idx[pdb] for pdb in cluster_member_pdbs]

    # Extract the relevant row from the RMSD matrix (distances from current PDB)
    distances_from_current_pdb = rmsd_X_projected[current_pdb_idx, :] # FIXME
    summed_distance = distances_from_current_pdb[cluster_member_indices].sum()
    # summed_distance_squared = (distances_from_current_pdb[cluster_member_indices]**2).sum() # TODO
    # if i want to know which files distort the cluster
    return summed_distance

def hierarchical_dendrogram(t_dist, cluster_num, cluster_df, pdb_files, X):
    # Perform hierarchical clustering and return sub-cluster objects    
    if cluster_num > cluster_df['Cluster'].max() or cluster_num <= 0:
        print(f"Pick correct cluster number... not {cluster_num}")
        return
    
    sub_cluster_df = cluster_df[cluster_df['Cluster'] == cluster_num]
    query_files = sub_cluster_df['PDB_File'].to_numpy()
    pdb_files = np.asarray([file.split('/')[-1] for file in pdb_files])
    
    X_sub = X.copy()
    mask = np.isin(pdb_files, query_files)
    selected = np.flatnonzero(mask) 
    selected_files = pdb_files[selected]

    X_sub = X_sub[np.ix_(selected, selected)]

    Z_sub = sch.linkage(squareform(X_sub), method='ward', metric='precomputed')
    labels_sub = sch.fcluster(Z_sub, t=t_dist, criterion='distance')

    num_clusters_sub = len(np.unique(labels_sub))
    print(f"Found {num_clusters_sub} number of clusters... labeled {len(labels_sub)} structures")

    plot_dendrogram(Z_sub, selected_files, num_clusters_sub, t_dist, save_path=f"plots/rmsd_max_of_projected_cluster_{cluster_num}_dendrogram")

    K_hier = pd.DataFrame({
        'PDB_File': selected_files,
        'Cluster': labels_sub,
    })
    return K_hier, X_sub

# Get a clustering of subcluster number 11 with ward threshold 10
subcluster_num = 11
K_hier, X_sub = hierarchical_dendrogram(10, subcluster_num, K, pdb_files_max, rmsd_X_max)
pdb_to_idx = {file: idx for idx, file in enumerate(K_hier['PDB_File'])}
# Apply the function to each row
K_hier['Summed_Distances_Within_Cluster'] = K_hier.apply(calculate_summed_distance_within_cluster_spatial_sub, axis=1)
Found 11 clusters... labeled 309 structures
# Plot each 3D-clustered subcluster in 2D
for i in range(len(K_hier['Cluster'].unique())):
    plot_projected_cluster(i+1, projected_alignment=False, X=X_sub, K_labeled=K_hier, save_path=f"plots/max_clusters/sub{subcluster_num}_max_cluster")
Saved superimposed structure to superposed_cluster_1.cif
Number of structures in cluster 1: 48
Mean RMSD within cluster: 2.73 Å
Standard deviation of RMSDs within cluster: 0.79 Å
Standard deviation of RMSDs superposition to centroid structure: 0.67 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.76
Variance explained by PC2: 0.23
Saved superimposed structure to superposed_cluster_2.cif
Number of structures in cluster 2: 31
Mean RMSD within cluster: 2.98 Å
Standard deviation of RMSDs within cluster: 1.20 Å
Standard deviation of RMSDs superposition to centroid structure: 0.61 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.84
Variance explained by PC2: 0.16
Saved superimposed structure to superposed_cluster_3.cif
Number of structures in cluster 3: 27
Mean RMSD within cluster: 2.88 Å
Standard deviation of RMSDs within cluster: 0.83 Å
Standard deviation of RMSDs superposition to centroid structure: 0.78 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.75
Variance explained by PC2: 0.25
Saved superimposed structure to superposed_cluster_4.cif
Number of structures in cluster 4: 45
Mean RMSD within cluster: 2.35 Å
Standard deviation of RMSDs within cluster: 0.70 Å
Standard deviation of RMSDs superposition to centroid structure: 0.50 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.70
Variance explained by PC2: 0.29
Saved superimposed structure to superposed_cluster_5.cif
Number of structures in cluster 5: 9
Mean RMSD within cluster: 0.54 Å
Standard deviation of RMSDs within cluster: 0.19 Å
Standard deviation of RMSDs superposition to centroid structure: 0.20 Å
Total variance explained by best-fit plane: ~0.99
Variance explained by PC1: 0.71
Variance explained by PC2: 0.28
Saved superimposed structure to superposed_cluster_6.cif
Number of structures in cluster 6: 21
Mean RMSD within cluster: 3.16 Å
Standard deviation of RMSDs within cluster: 0.97 Å
Standard deviation of RMSDs superposition to centroid structure: 0.51 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.74
Variance explained by PC2: 0.26
Saved superimposed structure to superposed_cluster_7.cif
Number of structures in cluster 7: 34
Mean RMSD within cluster: 3.07 Å
Standard deviation of RMSDs within cluster: 0.80 Å
Standard deviation of RMSDs superposition to centroid structure: 0.43 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.84
Variance explained by PC2: 0.16
Saved superimposed structure to superposed_cluster_8.cif
Number of structures in cluster 8: 42
Mean RMSD within cluster: 3.13 Å
Standard deviation of RMSDs within cluster: 1.03 Å
Standard deviation of RMSDs superposition to centroid structure: 0.76 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.76
Variance explained by PC2: 0.23
Saved superimposed structure to superposed_cluster_9.cif
Number of structures in cluster 9: 11
Mean RMSD within cluster: 2.28 Å
Standard deviation of RMSDs within cluster: 1.15 Å
Standard deviation of RMSDs superposition to centroid structure: 0.83 Å
Total variance explained by best-fit plane: ~0.99
Variance explained by PC1: 0.75
Variance explained by PC2: 0.24
Saved superimposed structure to superposed_cluster_10.cif
Number of structures in cluster 10: 14
Mean RMSD within cluster: 3.10 Å
Standard deviation of RMSDs within cluster: 1.39 Å
Standard deviation of RMSDs superposition to centroid structure: 0.51 Å
Total variance explained by best-fit plane: ~1.00
Variance explained by PC1: 0.69
Variance explained by PC2: 0.30
Saved superimposed structure to superposed_cluster_11.cif
Number of structures in cluster 11: 27
Mean RMSD within cluster: 2.51 Å
Standard deviation of RMSDs within cluster: 1.17 Å
Standard deviation of RMSDs superposition to centroid structure: 0.79 Å
Total variance explained by best-fit plane: ~0.99
Variance explained by PC1: 0.82
Variance explained by PC2: 0.17