21_Analysis_Datastructure/diff_plotting_chunk_coeffs.py at master · NikJur/21_Analysis_Datastructure · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

# --- CONFIGURATION ---

PROTEINS_TO_ANALYZE = {
    "RbsB": 1,
    "SpeA": 2,
    "FkpA": 3
}

folder_paths = [
    "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/20_AMBER/",
    "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/12_AA/1_monomer/",
    "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/13_SIRAH/",
    "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/9_Martini3/3_303K/",
    "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/10_Martini2_nonPW/3_303K/",
    "/ceph/users/akv16273/34_Elcock_13abundant_inEColi/21_Analysis_Datastructure/11_Martini2_PW/3_303K/"
]

Y_LABEL = ["AMBER", "CHARMM", "SIRAH hybrid", "M3", "M2", "M2PW"]
COLOURS = ['olive', 'cyan', 'brown', 'magenta', 'orange', 'blue']

# --- FONT/LABEL SIZE CONFIG ---
plt.rcParams.update({
    'font.size': 18,
    'axes.labelsize': 18,
    'axes.titlesize': 18,
    'xtick.labelsize': 16,
    'ytick.labelsize': 16,
    'legend.fontsize': 18,
})

def parse_file(filename, column_index):
    tau, msd = [], []
    with open(filename, 'r') as file:
        for line in file:
            if line.startswith("#") or line.startswith("@"):
                continue
            data = line.split()
            try:
                tau.append(float(data[0]))
                msd.append(float(data[column_index]))
            except (ValueError, IndexError):
                print(f"Warning: Could not parse line in {filename}: {line.strip()}")
                continue
    return tau, msd

# --- STORAGE FOR CROSS-PROTEIN COMPARISON ---
protein_means = {k: None for k in PROTEINS_TO_ANALYZE}
protein_stdevs = {k: None for k in PROTEINS_TO_ANALYZE}

# --- PLOTTING SETUP ---
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
x_positions = np.arange(len(Y_LABEL))

# Loop through each protein to create its subplot and collect means/SDs
for ax, (protein_name, column_idx) in zip(axes, PROTEINS_TO_ANALYZE.items()):
    all_across_FFs = []
    averages_across_FFs = []
    stdevs_across_FFs = []

    for main_folder_path in folder_paths:
        all_diffusion_coefficients = []

        # Loop over replica folders ('Take_*').
        if not os.path.isdir(main_folder_path):
            averages_across_FFs.append(np.nan)
            stdevs_across_FFs.append(np.nan)
            all_across_FFs.append([])
            continue

        for folder_name in os.listdir(main_folder_path):
            if folder_name.startswith("Take_"):
                folder_path = os.path.join(main_folder_path, folder_name)
                filenames = [os.path.join(folder_path, f'diff_RbsB_SpeA_FkpA_{i * 100000}_{(i + 1) * 100000}.xvg') for i in range(10)]

                for filename in filenames:
                    if os.path.isfile(filename):
                        tau, msd = parse_file(filename, column_idx)
                        if len(tau) != len(msd) or not tau:
                            continue

                        time = np.array(tau)
                        msd_array = np.array(msd)

                        start_time, end_time = 2000, 4000
                        time_mask = (time >= start_time) & (time <= end_time)
                        if not np.any(time_mask):
                            continue

                        slope, _, _, _, _ = linregress(time[time_mask], msd_array[time_mask])
                        diffusion_coefficient = (slope / 6) * 1e5  # 10^7 nm^2/s
                        all_diffusion_coefficients.append(diffusion_coefficient)

        if all_diffusion_coefficients:
            averages_across_FFs.append(np.mean(all_diffusion_coefficients))
            stdevs_across_FFs.append(np.std(all_diffusion_coefficients))
            all_across_FFs.append(all_diffusion_coefficients)
        else:
            averages_across_FFs.append(np.nan)
            stdevs_across_FFs.append(np.nan)
            all_across_FFs.append([])

    # Store for cross-protein ratios later
    protein_means[protein_name] = np.array(averages_across_FFs, dtype=float)
    protein_stdevs[protein_name] = np.array(stdevs_across_FFs, dtype=float)

    # Print means ± SD for this protein
    print(f"--- Mean ± StDev Diffusion Coefficients for {protein_name} (D in 10^7 nm^2/s) ---")
    for i, label in enumerate(Y_LABEL):
        m, s = averages_across_FFs[i], stdevs_across_FFs[i]
        if not np.isnan(m):
            print(f"{label:>12}: {m:.4f} ± {s:.4f}")
        else:
            print(f"{label:>12}: Not Available")
    print("-" * 70)

    # Plot points and mean markers using numeric x, then set labels
    for i, folder_data in enumerate(all_across_FFs):
        if not folder_data:
            continue
        ax.scatter(np.full(len(folder_data), x_positions[i]), folder_data,
                   color=COLOURS[i], alpha=0.3, label=Y_LABEL[i], zorder=2)
        ax.scatter(x_positions[i], averages_across_FFs[i], color='black', marker='^', zorder=3)

    ax.set_title(protein_name)
    ax.set_xticks(x_positions)
    ax.set_xticklabels(Y_LABEL, rotation=45, ha="right")
    ax.set_ylim(0, 20)
    ax.set_ylabel('D (10$^7$ nm$^2$/s)')

    # Unique legend per subplot
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    mean_marker = plt.Line2D([], [], color='black', marker='^', linestyle='None', label='Mean (in silico)')
    by_label['Mean (in silico)'] = mean_marker
    # ax.legend(by_label.values(), by_label.keys())

# --- PRINT PER-FF TABLES WITH RATIOS ---
def safe_ratio(a, b):
    return np.nan if (b is None or np.isnan(b) or b == 0) else (a / b)

print("\n=== Cross-protein comparison per force field ===")
for i, ff in enumerate(Y_LABEL):
    rbsb_m, rbsb_s = protein_means["RbsB"][i], protein_stdevs["RbsB"][i]
    spea_m, spea_s = protein_means["SpeA"][i], protein_stdevs["SpeA"][i]
    fkpa_m, fkpa_s = protein_means["FkpA"][i], protein_stdevs["FkpA"][i]

    fkp_over_spea = safe_ratio(fkpa_m, spea_m)
    rbs_over_spea = safe_ratio(rbsb_m, spea_m)
    rbs_over_fkp  = safe_ratio(rbsb_m, fkpa_m)

    print(f"\n[{ff}]")
    if all(np.isnan(x) for x in [rbsb_m, spea_m, fkpa_m]):
        print("  No data available.")
        continue

    def fmt(m, s): return "NA" if np.isnan(m) else f"{m:.4f} ± {s:.4f}"
    print(f"  RbsB : {fmt(rbsb_m, rbsb_s)}")
    print(f"  SpeA : {fmt(spea_m, spea_s)}")
    print(f"  FkpA : {fmt(fkpa_m, fkpa_s)}")

    def fmt_ratio(x): return "NA" if np.isnan(x) else f"{x:.2f}"
    print("  Scaling factors:")
    print(f"    FkpA / SpeA = {fmt_ratio(fkp_over_spea)}")
    print(f"    RbsB / SpeA = {fmt_ratio(rbs_over_spea)}")
    print(f"    RbsB / FkpA = {fmt_ratio(rbs_over_fkp)}")

# --- RANKING (LOWEST → HIGHEST) PER PROTEIN ---
def rank_labels_by_values(labels, values, tie_tol=1e-6):
    """Return a list of strings with ties merged using '=' and order arrows '<'."""
    # Filter out NaNs, keep pairs
    pairs = [(lab, val) for lab, val in zip(labels, values) if not np.isnan(val)]
    if not pairs:
        return "No data available."

    # Sort by value ascending
    pairs.sort(key=lambda x: x[1])

    # Build with tie handling
    out = []
    current_group = [pairs[0][0]]
    current_val = pairs[0][1]

    for lab, val in pairs[1:]:
        if np.isclose(val, current_val, rtol=0, atol=tie_tol):
            current_group.append(lab)
        else:
            out.append(" = ".join(current_group))
            out.append("<")
            current_group = [lab]
            current_val = val
    out.append(" = ".join(current_group))
    return " ".join(out)

print("\n=== Force field ranking (lowest → highest) ===")
sirah_lowest_flags = []
avg_ranks_per_ff = {ff: [] for ff in Y_LABEL}

for protein in ["RbsB", "SpeA", "FkpA"]:
    vals = protein_means[protein]
    line = rank_labels_by_values(Y_LABEL, vals, tie_tol=1e-6)
    print(f"{protein:>5}: {line}")

    # Consistency check: is SIRAH hybrid lowest (allowing for ties within tolerance)?
    if np.isnan(vals).all():
        sirah_lowest_flags.append(False)
    else:
        min_val = np.nanmin(vals)
        sirah_is_lowest = np.isclose(vals[Y_LABEL.index("SIRAH hybrid")], min_val, atol=1e-6, rtol=0)
        sirah_lowest_flags.append(bool(sirah_is_lowest))

    # Average rank per FF (1 = lowest). Handle ties by assigning the average of tied ranks.
    # Compute ranks with tie handling
    finite_mask = ~np.isnan(vals)
    finite_vals = vals[finite_mask]
    finite_labels = np.array(Y_LABEL)[finite_mask]
    if finite_vals.size:
        order = np.argsort(finite_vals)
        sorted_vals = finite_vals[order]
        sorted_labels = finite_labels[order]

        ranks = np.zeros_like(sorted_vals, dtype=float)
        start = 0
        while start < len(sorted_vals):
            end = start + 1
            while end < len(sorted_vals) and np.isclose(sorted_vals[end], sorted_vals[start], atol=1e-6, rtol=0):
                end += 1
            avg_rank = (start + 1 + end) / 2.0  # average of 1-based ranks in the tie group
            ranks[start:end] = avg_rank
            start = end

        for lab, rk in zip(sorted_labels, ranks):
            avg_ranks_per_ff[lab].append(rk)

# Quick summary about SIRAH consistency and average ranks
all_sirah_lowest = all(sirah_lowest_flags)
print("\n--- Summary ---")
print(f"SIRAH hybrid lowest for all three proteins: {'Yes' if all_sirah_lowest else 'No'}")
print("Average rank across proteins (1 = lowest):")
for ff in Y_LABEL:
    if avg_ranks_per_ff[ff]:
        print(f"  {ff:12}: {np.mean(avg_ranks_per_ff[ff]):.2f}")
    else:
        print(f"  {ff:12}: NA")

# --- SAVE PLOT ---
plt.tight_layout()
output_filename = "RbsB_SpeA_FkpA_diff_coeff.png"
plt.savefig(output_filename, dpi=600, bbox_inches='tight')
plt.show()