Figure 2

Overview

Figure 2 compares prediction-score distributions across truth categories. The figure starts from scored candidate sites rather than from the aggregate benchmark summaries. This is necessary because the plot compares score distributions for several site classes: experimentally validated true off-target sites, experimentally tested false sites, sites with unknown truth status, and candidate sites that were scored by a tool but not experimentally sequenced.

Two tables provide these inputs. figure_2_prediction_score_categories.csv contains one row per scored candidate site and tool, with the raw tool score and the truth category assigned to that site. figure_2_true_false_score_significance.csv summarizes the statistical separation between scores assigned to validated true and false sites for each tool. The first table is used to draw the score distributions; the second table records the corresponding true-versus-false comparison.

Input tables

data/zenodo/figure2_score_distributions/figure_2_prediction_score_categories.csv
data/zenodo/figure2_score_distributions/figure_2_true_false_score_significance.csv

Code

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image, display

from offtarget_benchmark.layout import repo_layout

# Resolve the directory that holds the Figure 2 input tables.
layout = repo_layout()
asset_dir = layout.data_dir / 'zenodo' / 'figure2_score_distributions'

# Per-site scores (one row per scored candidate site and tool). low_memory=False
# makes pandas parse the file in a single pass instead of in chunks.
prediction_categories = pd.read_csv(
    asset_dir / 'figure_2_prediction_score_categories.csv',
    low_memory=False,
)
# Per-tool true-versus-false comparison.
significance = pd.read_csv(
    asset_dir / 'figure_2_true_false_score_significance.csv',
)

# Cell output: row count, number of distinct tools, and the sorted set of
# non-missing truth labels found in the per-site table.
{
    'rows': len(prediction_categories),
    'tools': prediction_categories['tool'].nunique(),
    'truth_categories': sorted(
        prediction_categories['truth_category']
        .dropna()
        .astype(str)
        .unique()
        .tolist()
    ),
}

{'rows': 2769527,
 'tools': 10,
 'truth_categories': ['false', 'not_sequenced', 'true']}

Score categories

Code

# Normalise the two columns the figure relies on: scores that cannot be parsed
# as numbers become NaN, and truth labels are stored with the 'string' dtype.
prediction_categories['raw_score'] = pd.to_numeric(
    prediction_categories['raw_score'],
    errors='coerce',
)
prediction_categories['truth_category'] = (
    prediction_categories['truth_category'].astype('string')
)

# Row count per (tool, truth category). dropna=False keeps rows with a missing
# key as their own group instead of discarding them.
category_counts = (
    prediction_categories
    .groupby(by=['tool', 'truth_category'], dropna=False)
    .size()
    .reset_index(name='n_rows')
    .sort_values(by=['tool', 'truth_category'])
)
# Cell output: the first twelve rows of the count table.
category_counts.iloc[:12]

	tool	truth_category	n_rows
0	CCTop	false	395
1	CCTop	not_sequenced	19720
2	CCTop	true	127
3	CRISOT	false	1653
4	CRISOT	not_sequenced	562001
5	CRISOT	true	427
6	CRISPOR	false	348
7	CRISPOR	not_sequenced	25128
8	CRISPOR	true	125
9	CRISPROFF	false	1641
10	CRISPROFF	not_sequenced	559224
11	CRISPROFF	true	384

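Each panel plots whichever of the categories true, false, unknown, and not_sequenced are present for that tool, in that order. In the table loaded above only true, false, and not_sequenced occur, so no unknown box is drawn. The score scale is not harmonized across tools, so each panel should be read within its own tool and scores should not be compared between panels.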

True versus false score summary

Code

# Cell output: the ten rows with the smallest q_value_bh, ties ordered by tool name.
significance.sort_values(by=['q_value_bh', 'tool']).iloc[:10]

	tool	n_true	n_false	true_median_score	false_median_score	mannwhitney_u	p_value	q_value_bh	significant_0_05
0	CRISPROFF	384	1641	8.353244	1.385921	474511.5	6.684638e-54	6.684638e-53	True
1	MOFF	427	1653	0.017030	0.002636	502264.5	1.581551e-41	7.907756e-41	True
2	CRISOT	427	1653	0.699222	0.624388	487987.5	2.792779e-34	9.309262e-34	True
3	CRISPRitz_mismatch	154	692	-4.000000	-5.000000	73731.5	1.074504e-14	2.686260e-14	True
4	CRISPRitz_cfd	154	692	0.224576	0.114296	72546.0	2.163040e-12	4.326081e-12	True
5	CCTop	127	395	-4.000000	-4.000000	33714.5	5.132583e-10	8.554306e-10	True
6	Cas-OFFinder	141	573	-5.000000	-5.000000	52937.0	1.842713e-09	2.632447e-09	True
7	CRISPOR	125	348	0.275862	0.148387	29156.5	1.603561e-08	2.004451e-08	True
8	GuideScan2	38	238	-4.000000	-5.000000	6127.0	2.594302e-04	2.882558e-04	True
9	FlashFry	27	203	-4.000000	-5.000000	3634.5	3.726627e-03	3.726627e-03	True

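This table summarizes, for each tool, the separation between the scores assigned to validated true and false sites, as reported alongside the score-distribution figure. Rows are ordered by q_value_bh (ties broken by tool name), so the strongest separation appears first; in the output above, all ten tools have significant_0_05 set to True. Because score scales differ between tools, the median scores are comparable only within a row.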

Figure generation

Code

# Draw Figure 2: one panel per tool, each showing a box plot of raw scores per
# truth category with the individual scores overlaid as jittered points. The
# figure is written to docs/generated_figures and then displayed inline.
figure_dir = layout.docs_dir / 'generated_figures'
figure_dir.mkdir(parents=True, exist_ok=True)

tool_list = sorted(prediction_categories['tool'].dropna().astype(str).unique().tolist())
# squeeze=False keeps `axes` two-dimensional even when there is a single tool.
fig, axes = plt.subplots(1, len(tool_list), figsize=(4.5 * len(tool_list), 4.8), squeeze=False)
category_order = ['true', 'false', 'unknown', 'not_sequenced']

for idx, tool in enumerate(tool_list):
    ax = axes[0, idx]
    # Read-only slice: nothing below writes to it, so no defensive copy is made.
    sub = prediction_categories[prediction_categories['tool'] == tool]
    # Build the label set once per tool instead of once per candidate category.
    present = set(sub['truth_category'].astype(str))
    cats = [cat for cat in category_order if cat in present]
    # Keep the scores as float arrays; converting hundreds of thousands of
    # values to Python lists only for matplotlib to convert them back is wasted work.
    data = [
        sub.loc[sub['truth_category'] == cat, 'raw_score'].dropna().astype(float).to_numpy()
        for cat in cats
    ]
    # Draw only when at least one category has a usable (non-NaN) score.
    if any(len(values) for values in data):
        ax.boxplot(data, tick_labels=cats)
        for pos, values in enumerate(data, start=1):
            # Evenly spaced horizontal offsets in row order, so overlapping
            # points spread across the box; an empty category yields no points.
            jitter = np.linspace(-0.12, 0.12, num=len(values))
            ax.scatter(pos + jitter, values, s=12, alpha=0.25)
    else:
        ax.text(0.5, 0.5, 'No calls in this view', ha='center', va='center', transform=ax.transAxes)
        ax.set_xticks([])
    ax.set_title(tool)
    ax.set_ylabel('Raw score')
    ax.grid(axis='y', alpha=0.2)

fig.suptitle('Figure 2. Score Distribution By Tool and Truth Category')
# Leave the top 6% of the canvas free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.94])
out_path = figure_dir / 'figure_2_score_distribution_by_tool_and_truth_category.png'
fig.savefig(out_path, dpi=300)
# Close the figure so it is not rendered a second time by the notebook backend.
plt.close(fig)

display(Image(filename=str(out_path)))