Figure 3

Overview

Figure 3 summarizes ranking performance across tools. The left panels show the precision recall curves. The right panels show average precision with guide bootstrap intervals.

Input tables

  • benchmark_pr_curves.csv
  • benchmark_pr_summary.csv
  • benchmark_matched_truth_long.csv
  • benchmark_by_guide.csv
Code
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image, display

from offtarget_benchmark.layout import repo_layout

layout = repo_layout()
benchmark_dir = layout.results_dir / 'benchmark_runs' / 'manuscript_primary'

pr_curves = pd.read_csv(benchmark_dir / 'benchmark_pr_curves.csv', low_memory=False)
pr_summary = pd.read_csv(benchmark_dir / 'benchmark_pr_summary.csv', low_memory=False)
matched_long = pd.read_csv(benchmark_dir / 'benchmark_matched_truth_long.csv', low_memory=False)
by_guide = pd.read_csv(benchmark_dir / 'benchmark_by_guide.csv', low_memory=False)

{
    'pr_curves_rows': len(pr_curves),
    'pr_summary_rows': len(pr_summary),
    'matched_truth_rows': len(matched_long),
    'by_guide_rows': len(by_guide),
}
{'pr_curves_rows': 5331,
 'pr_summary_rows': 10,
 'matched_truth_rows': 5321,
 'by_guide_rows': 260}

Stored average precision summary

Code
pr_summary[['tool', 'average_precision', 'best_f1', 'n_true_sites']].sort_values(
    'average_precision', ascending=False
).head(10)
tool average_precision best_f1 n_true_sites
9 MOFF 0.311067 0.375000 118
2 CRISOT 0.296917 0.374429 118
4 CRISPRitz_cfd 0.236436 0.342857 118
6 CRISPROFF 0.235080 0.312883 118
3 CRISPOR 0.227962 0.362832 118
5 CRISPRitz_mismatch 0.195054 0.306931 118
0 Cas-OFFinder 0.174777 0.322581 118
1 CCTop 0.163414 0.326761 118
8 GuideScan2 0.061821 0.156098 118
7 FlashFry 0.009670 0.091954 118

The stored summary already contains the global average precision value for each tool. The remaining steps add guide bootstrap intervals and prepare the plotted tables.

Bootstrap helper functions

Code
DISCRETE_TOOLS = {'Cas-OFFinder', 'GuideScan2', 'CCTop', 'FlashFry', 'CRISPRitz_mismatch'}

def normalize_truth_status(value):
    if pd.isna(value):
        return 'unknown'
    if isinstance(value, (bool, np.bool_)):
        return 'true' if bool(value) else 'false'
    text = str(value).strip().lower()
    if text in {'true', 'off'}:
        return 'true'
    if text in {'false', 'on'}:
        return 'false'
    return 'unknown'


def average_precision_from_ranked_rows(rows: pd.DataFrame, n_true_sites: int) -> float:
    if n_true_sites <= 0:
        return np.nan
    ranked = rows.dropna(subset=['rank']).sort_values(['rank', 'match_key']).reset_index(drop=True)
    if ranked.empty:
        return 0.0
    is_true = ranked['truth_status'].map(normalize_truth_status).eq('true').to_numpy()
    if not is_true.any():
        return 0.0
    precision = np.cumsum(is_true) / np.arange(1, len(is_true) + 1)
    return float(precision[is_true].sum() / n_true_sites)

normalize_truth_status standardizes the truth labels used in the matched site table. average_precision_from_ranked_rows recalculates average precision from an already ranked set of matched rows.

Prepare matched rows and guide counts

Code
rng = np.random.default_rng(20260421)
matched_long['truth_status'] = matched_long['truth_status'].map(normalize_truth_status)
matched_long['rank'] = pd.to_numeric(matched_long['rank'], errors='coerce')
by_guide['n_truth_true'] = pd.to_numeric(by_guide['n_truth_true'], errors='coerce').fillna(0).astype(int)

The bootstrap operates at guide level. The matched site table therefore needs a numeric rank column and the guide table needs a numeric count of validated true sites.

Bootstrap intervals by tool

Code
ci_rows = []
bootstrap_reps = 250
for _, summary_row in pr_summary.sort_values('tool').iterrows():
    tool = str(summary_row['tool'])
    tool_matched = matched_long[matched_long['tool'].astype(str) == tool].copy()
    tool_guides = by_guide[by_guide['tool'].astype(str) == tool].copy()
    guide_records = tool_guides[['guide_key', 'n_truth_true']].copy()
    guide_records['guide_key'] = guide_records['guide_key'].astype(str)
    guide_records = guide_records[guide_records['n_truth_true'] > 0]
    guide_keys = guide_records['guide_key'].to_numpy()
    truth_by_guide = guide_records.set_index('guide_key')['n_truth_true'].to_dict()
    matched_by_guide = {
        str(guide): sub.copy()
        for guide, sub in tool_matched.groupby(tool_matched['guide_key'].astype(str))
    }

    samples = []
    if len(guide_keys) > 0:
        for _ in range(bootstrap_reps):
            sampled_guides = rng.choice(guide_keys, size=len(guide_keys), replace=True)
            n_true = int(sum(int(truth_by_guide.get(guide, 0)) for guide in sampled_guides))
            sampled_rows = [matched_by_guide[guide] for guide in sampled_guides if guide in matched_by_guide]
            sampled = pd.concat(sampled_rows, ignore_index=True) if sampled_rows else tool_matched.iloc[0:0].copy()
            samples.append(average_precision_from_ranked_rows(sampled, n_true))
    finite = pd.Series(samples, dtype='float64').dropna()
    average_precision = float(pd.to_numeric(pd.Series([summary_row['average_precision']]), errors='coerce').iloc[0])
    ci_rows.append({
        'tool': tool,
        'average_precision': average_precision,
        'ap_ci_low_95': float(np.percentile(finite, 2.5)) if not finite.empty else average_precision,
        'ap_ci_high_95': float(np.percentile(finite, 97.5)) if not finite.empty else average_precision,
    })

pr_auc_ci = pd.DataFrame(ci_rows)
pr_auc_ci.sort_values('average_precision', ascending=False).head(10)
tool average_precision ap_ci_low_95 ap_ci_high_95
9 MOFF 0.311067 0.218205 0.633427
1 CRISOT 0.296917 0.223800 0.555427
4 CRISPRitz_cfd 0.236436 0.129960 0.436818
3 CRISPROFF 0.235080 0.159578 0.499200
2 CRISPOR 0.227962 0.132015 0.429939
5 CRISPRitz_mismatch 0.195054 0.104998 0.397588
6 Cas-OFFinder 0.174777 0.091184 0.404160
0 CCTop 0.163414 0.091637 0.356431
8 GuideScan2 0.061821 0.013338 0.175939
7 FlashFry 0.009670 0.001769 0.043003

Each bootstrap replicate resamples guides, reconstructs the matched rows for that sample, and recalculates average precision. The resulting interval reflects variation across guide composition.

Plot tables

Code
plot_df = pr_curves.copy()
plot_df['score_section'] = plot_df['tool'].astype(str).map(
    lambda tool: 'discrete standard' if tool in DISCRETE_TOOLS else 'continuous standard'
)
pr_auc_df = pr_summary[['tool', 'average_precision']].copy()
pr_auc_df['average_precision'] = pd.to_numeric(pr_auc_df['average_precision'], errors='coerce')
pr_auc_df = pr_auc_df.dropna(subset=['average_precision']).sort_values(
    ['average_precision', 'tool'],
    ascending=[False, True],
).reset_index(drop=True)
pr_auc_df['score_section'] = pr_auc_df['tool'].astype(str).map(
    lambda tool: 'discrete standard' if tool in DISCRETE_TOOLS else 'continuous standard'
)
pr_auc_df = pr_auc_df.merge(pr_auc_ci[['tool', 'ap_ci_low_95', 'ap_ci_high_95']], on='tool', how='left')
pr_auc_df.head()
tool average_precision score_section ap_ci_low_95 ap_ci_high_95
0 MOFF 0.311067 continuous standard 0.218205 0.633427
1 CRISOT 0.296917 continuous standard 0.223800 0.555427
2 CRISPRitz_cfd 0.236436 continuous standard 0.129960 0.436818
3 CRISPROFF 0.235080 continuous standard 0.159578 0.499200
4 CRISPOR 0.227962 continuous standard 0.132015 0.429939

plot_df holds the stored precision recall coordinates. pr_auc_df combines the stored scalar summary with the bootstrap interval.

Figure generation

Code
figure_dir = layout.docs_dir / 'generated_figures'
figure_dir.mkdir(parents=True, exist_ok=True)

sections = ['discrete standard', 'continuous standard']
fig, axes = plt.subplots(2, 2, figsize=(13.5, 8.8), gridspec_kw={'width_ratios': [1.8, 1.0]})

for row_idx, section in enumerate(sections):
    ax = axes[row_idx, 0]
    section_curves = plot_df[plot_df['score_section'] == section]
    for tool, sub in section_curves.groupby('tool', sort=True):
        sub = sub.sort_values(['recall', 'rank'], ascending=[True, True]).reset_index(drop=True)
        ax.plot(sub['recall'], sub['precision'], linewidth=2, label=str(tool))
    ax.set_xlim(-0.02, 1.02)
    ax.set_ylim(-0.02, 1.02)
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision recall curves: {section.title()} tools')
    ax.grid(alpha=0.2)
    if not section_curves.empty:
        ax.legend(loc='lower left', fontsize=7)

    auc_ax = axes[row_idx, 1]
    section_auc = pr_auc_df[pr_auc_df['score_section'] == section].copy()
    y_positions = np.arange(len(section_auc))
    low = pd.to_numeric(section_auc['ap_ci_low_95'], errors='coerce')
    high = pd.to_numeric(section_auc['ap_ci_high_95'], errors='coerce')
    value = section_auc['average_precision'].astype(float)
    xerr = np.vstack([(value - low).clip(lower=0), (high - value).clip(lower=0)]) if low.notna().all() and high.notna().all() else None
    auc_ax.barh(
        y_positions,
        section_auc['average_precision'],
        xerr=xerr,
        color='#c46a2c' if section == 'continuous standard' else '#2c5f77',
        edgecolor='white',
        linewidth=0.8,
        error_kw={'elinewidth': 1.0, 'capsize': 3, 'capthick': 1.0},
    )
    auc_ax.set_yticks(y_positions)
    auc_ax.set_yticklabels(section_auc['tool'])
    auc_ax.invert_yaxis()
    auc_ax.set_xlim(0.0, 1.0)
    auc_ax.set_xlabel('Average precision')
    auc_ax.set_title(f'Average precision with 95% CI: {section.title()} tools')
    auc_ax.grid(axis='x', alpha=0.2)
    for idx, plotted_value in enumerate(section_auc['average_precision']):
        auc_ax.text(min(float(plotted_value) + 0.015, 0.985), idx, f'{float(plotted_value):.2f}', va='center', fontsize=7)

fig.suptitle('Figure 3. Precision recall curves with average precision comparison')
fig.tight_layout(rect=[0, 0, 1, 0.96])
out_path = figure_dir / 'figure_3_precision_recall_curves.png'
fig.savefig(out_path, dpi=300)
plt.close(fig)

display(Image(filename=str(out_path)))