Figure 5

Coverage of CRISPRoffT Sites by Prediction Tools

Figure 5 compares the CRISPRoffT off-target sites with the site sets evaluated by each prediction tool. The y-axis gives counts of unique CRISPRoffT site identities, defined by guide sequence, off-target sequence, and chromosome.

Each bar is split into four categories: validated true sites that were scored by the tool, labeled false sites that were scored by the tool, validated true sites that were not scored, and labeled false sites that were not scored. The panels separate discrete standard tools, continuous standard tools, and ML-based tools.

The ML tools were available only for the full-length, no-bulge-compatible candidate universe. In panel C, CRISPRoffT sites outside that compatible space therefore remain in the denominator and appear as unscored for the ML tools.

Inputs

The CRISPRoffT site set comes from the filtered manuscript truth table:

  • data/manuscript/manuscript_primary.csv

The standard prediction contracts are expected under:

  • data/zenodo/standard_tool_predictions/

The ML prediction contracts are expected under:

  • data/zenodo/no_bulge_ml_tool_predictions/

These prediction contracts are large Zenodo-backed artifacts. During local development they can be symlinked into the expected directories.

Load CRISPRoffT Site Identities

Code
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from offtarget_benchmark.layout import repo_layout

# Resolve every input/output location once, relative to the repository layout.
layout = repo_layout()
data_root = layout.data_dir
zenodo_root = data_root / "zenodo"

# Filtered manuscript truth table (CRISPRoffT site set).
truth_path = data_root / "manuscript" / "manuscript_primary.csv"

# Prediction contracts: standard tools and no-bulge ML tools live side by side.
standard_prediction_dir = zenodo_root / "standard_tool_predictions"
ml_prediction_dir = zenodo_root / "no_bulge_ml_tool_predictions"

# Output directory for rendered figures; created on demand.
figure_dir = layout.docs_dir / "generated_figures"
figure_dir.mkdir(parents=True, exist_ok=True)

def normalize_truth_status(value):
    """Map a raw truth-status cell to ``"true"``, ``"false"``, or ``"unknown"``.

    Args:
        value: A single cell from the ``truth_status`` column. May be a
            missing value, a boolean, a number, or free text.

    Returns:
        ``"true"`` or ``"false"`` for recognised labels, ``"unknown"`` for
        missing values and anything unrecognised.
    """
    if pd.isna(value):
        return "unknown"
    if isinstance(value, (bool, np.bool_)):
        return "true" if bool(value) else "false"
    if isinstance(value, (int, float, np.integer, np.floating)):
        # pandas promotes a 0/1 column that contains blanks to float, so the
        # cell arrives as 1.0 / 0.0; str() would give "1.0", which the text
        # lookup below does not recognise. Compare numerically instead.
        if value == 1:
            return "true"
        if value == 0:
            return "false"
        return "unknown"
    text = str(value).strip().lower()
    # NOTE(review): "off" maps to true and "on" to false — presumably
    # off-target / on-target labels; confirm against the truth table schema.
    if text in {"true", "1", "yes", "off"}:
        return "true"
    if text in {"false", "0", "no", "on"}:
        return "false"
    return "unknown"

# Load the truth table and attach a normalised true/false/unknown label.
truth = pd.read_csv(truth_path, low_memory=False)
truth["truth_category"] = truth["truth_status"].map(normalize_truth_status)

# Only rows with a definite label take part in the figure.
labeled = truth.loc[truth["truth_category"].isin(["true", "false"])].copy()

# Site identity = guide sequence :: off-target sequence :: chromosome.
labeled["match_key"] = (
    labeled["guide_seq"]
    .astype(str)
    .str.cat(
        [labeled["offtarget_seq"].astype(str), labeled["chromosome"].astype(str)],
        sep="::",
    )
)

# One row per site identity. drop_duplicates keeps the first occurrence, so a
# site with several rows carries the label of its first row.
# NOTE(review): if a site can appear with both labels, confirm that
# "first row wins" is the intended tie-break.
truth_sites = (
    labeled.loc[:, ["match_key", "truth_category"]]
    .drop_duplicates(subset="match_key")
    .reset_index(drop=True)
)

(
    truth_sites["truth_category"]
    .value_counts()
    .rename_axis("truth_category")
    .reset_index(name="n_unique_sites")
)
truth_category n_unique_sites
0 false 1740
1 true 142

The figure uses unique site identities. This avoids counting the same site/tool overlap more than once when the source table contains repeated rows for the same guide, off-target sequence, and chromosome. When repeated rows exist, the truth label of the first row is kept.

Define Tool Groups and Read Prediction Contracts

Code
# Tool groups, one per figure panel. List order is the left-to-right bar order.
DISCRETE_TOOLS = [
    "CRISPRitz_mismatch",
    "Cas-OFFinder",
    "CCTop",
    "GuideScan2",
    "FlashFry",
]
CONTINUOUS_TOOLS = [
    "MOFF",
    "CRISOT",
    "CRISPRitz_cfd",
    "CRISPROFF",
    "CRISPOR",
]
ML_TOOLS = [
    "CRISPert",
    "CRISPR-OFFT",
    "CRISPR-IP",
    "CnnCRISPR",
]

# Axis tick labels; tools without an entry are labelled with their own name.
TOOL_LABELS = {
    "CRISPRitz_mismatch": "CRISPRitz_mm",
    "Cas-OFFinder": "Cas-OFFinder",
    "CCTop": "CCTop",
    "GuideScan2": "GuideScan",
    "FlashFry": "FlashFry",
    "CRISPRitz_cfd": "CRISPRitz_cfd",
}

# File-name suffix of each tool's contract: prediction_contract_<slug>.csv
_STANDARD_CONTRACT_SLUGS = {
    "Cas-OFFinder": "cas_offinder",
    "CCTop": "cctop",
    "CRISPRitz_mismatch": "crispritz_mismatch",
    "FlashFry": "flashfry",
    "GuideScan2": "guidescan2",
    "CRISOT": "crisot",
    "CRISPOR": "crispor",
    "CRISPROFF": "crisproff",
    "CRISPRitz_cfd": "crispritz_cfd",
    "MOFF": "moff",
}
_ML_CONTRACT_SLUGS = {
    "CnnCRISPR": "cnncrispr",
    "CRISPR-IP": "crisprip",
    "CRISPR-OFFT": "crispr_offt",
    "CRISPert": "crispert",
}

# Tool name -> path of its prediction contract (standard tools first, then ML).
TOOL_FILES = {
    **{
        tool: standard_prediction_dir / f"prediction_contract_{slug}.csv"
        for tool, slug in _STANDARD_CONTRACT_SLUGS.items()
    },
    **{
        tool: ml_prediction_dir / f"prediction_contract_{slug}.csv"
        for tool, slug in _ML_CONTRACT_SLUGS.items()
    },
}

# Fail early, listing every absent contract, before any large file is read.
missing = [
    str(contract_path)
    for contract_path in TOOL_FILES.values()
    if not contract_path.exists()
]
if missing:
    raise FileNotFoundError("\n".join(["Missing prediction contracts:", *missing]))

def read_scored_keys(path):
    """Return the set of site-identity keys present in one prediction contract.

    A key is ``guide_key::off_target_seq::chromosome``, built from the three
    columns of the same names in the CSV at ``path``.

    Args:
        path: Location of a prediction contract CSV containing the columns
            ``guide_key``, ``off_target_seq`` and ``chromosome``.

    Returns:
        A ``set`` of key strings, one per distinct complete site identity.
        Rows missing any of the three components are ignored.
    """
    key_columns = ["guide_key", "off_target_seq", "chromosome"]
    # Read the identity columns as text: type inference would turn a numeric
    # chromosome column containing blanks into floats ("1" -> "1.0").
    contract = pd.read_csv(path, usecols=key_columns, dtype=str, low_memory=False)
    # Drop incomplete rows BEFORE building keys. Casting to str first would
    # turn missing values into the literal text "nan" and make a later
    # dropna() a no-op.
    contract = contract.dropna(subset=key_columns)
    keys = (
        contract["guide_key"]
        + "::"
        + contract["off_target_seq"]
        + "::"
        + contract["chromosome"]
    )
    return set(keys)

# (bar-segment label, required truth label, required scored flag)
category_specs = [
    ("scored (true)", "true", True),
    ("scored (false)", "false", True),
    ("unscored (true)", "true", False),
    ("unscored (false)", "false", False),
]
panel_groups = [
    ("A. Discrete standard tools", DISCRETE_TOOLS),
    ("B. Continuous standard tools", CONTINUOUS_TOOLS),
    ("C. ML tools", ML_TOOLS),
]

# These do not depend on the tool, so build them once.
site_keys = truth_sites["match_key"].astype(str)
site_labels = truth_sites["truth_category"]

coverage_rows = []
for panel, tools in panel_groups:
    for tool in tools:
        # A truth site counts as scored when its key occurs in the tool's contract.
        in_contract = site_keys.isin(read_scored_keys(TOOL_FILES[tool]))
        for label, wanted_truth, wanted_scored in category_specs:
            hits = (site_labels == wanted_truth) & (in_contract == wanted_scored)
            coverage_rows.append(
                {
                    "panel": panel,
                    "tool": tool,
                    "category": label,
                    "n_sites": int(hits.sum()),
                }
            )

# Long format: one row per (panel, tool, category).
coverage = pd.DataFrame(coverage_rows)
coverage.head(12)
panel tool category n_sites
0 A. Discrete standard tools CRISPRitz_mismatch scored (true) 100
1 A. Discrete standard tools CRISPRitz_mismatch scored (false) 642
2 A. Discrete standard tools CRISPRitz_mismatch unscored (true) 42
3 A. Discrete standard tools CRISPRitz_mismatch unscored (false) 1098
4 A. Discrete standard tools Cas-OFFinder scored (true) 89
5 A. Discrete standard tools Cas-OFFinder scored (false) 530
6 A. Discrete standard tools Cas-OFFinder unscored (true) 53
7 A. Discrete standard tools Cas-OFFinder unscored (false) 1210
8 A. Discrete standard tools CCTop scored (true) 80
9 A. Discrete standard tools CCTop scored (false) 362
10 A. Discrete standard tools CCTop unscored (true) 62
11 A. Discrete standard tools CCTop unscored (false) 1378

A site is scored when it appears in the standardized prediction contract for the tool. For native search tools this means the site was returned by the tool. For pair-scoring tools and ML tools this means the site was part of the candidate universe supplied to that scorer.

Category Counts

Code
# Wide view of the counts: one row per (panel, tool), one column per category.
# Each cell is backed by a single long-format row, so the default mean
# aggregation simply returns that row's count.
pd.pivot_table(
    coverage,
    index=["panel", "tool"],
    columns="category",
    values="n_sites",
    fill_value=0,
).reset_index()
category panel tool scored (false) scored (true) unscored (false) unscored (true)
0 A. Discrete standard tools CCTop 362.0 80.0 1378.0 62.0
1 A. Discrete standard tools CRISPRitz_mismatch 642.0 100.0 1098.0 42.0
2 A. Discrete standard tools Cas-OFFinder 530.0 89.0 1210.0 53.0
3 A. Discrete standard tools FlashFry 191.0 15.0 1549.0 127.0
4 A. Discrete standard tools GuideScan2 225.0 24.0 1515.0 118.0
5 B. Continuous standard tools CRISOT 684.0 142.0 1056.0 0.0
6 B. Continuous standard tools CRISPOR 315.0 81.0 1425.0 61.0
7 B. Continuous standard tools CRISPROFF 676.0 107.0 1064.0 35.0
8 B. Continuous standard tools CRISPRitz_cfd 642.0 100.0 1098.0 42.0
9 B. Continuous standard tools MOFF 684.0 142.0 1056.0 0.0
10 C. ML tools CRISPR-IP 683.0 138.0 1057.0 4.0
11 C. ML tools CRISPR-OFFT 683.0 138.0 1057.0 4.0
12 C. ML tools CRISPert 683.0 138.0 1057.0 4.0
13 C. ML tools CnnCRISPR 683.0 138.0 1057.0 4.0

Render the Figure

Code
# Stacking order, bottom to top.
category_order = [
    "scored (false)",
    "scored (true)",
    "unscored (true)",
    "unscored (false)",
]
# Blues for scored sites, reds for unscored; dark shade = validated true site.
colors = {
    "scored (false)": "#9ecae1",
    "scored (true)": "#08306b",
    "unscored (true)": "#7f0000",
    "unscored (false)": "#fcae91",
}
panel_tools = {
    "A. Discrete standard tools": DISCRETE_TOOLS,
    "B. Continuous standard tools": CONTINUOUS_TOOLS,
    "C. ML tools": ML_TOOLS,
}

fig, axes = plt.subplots(1, 3, figsize=(15.5, 5.4), sharey=True)
for ax, (panel, tools) in zip(axes, panel_tools.items()):
    panel_rows = coverage.loc[coverage["panel"] == panel]
    positions = np.arange(len(tools))
    stack_base = np.zeros(len(tools), dtype=float)
    for category in category_order:
        per_tool = panel_rows.loc[panel_rows["category"] == category].set_index("tool")["n_sites"]
        # Align to the panel's tool order; a tool without a row contributes 0.
        heights = per_tool.reindex(tools).fillna(0).to_numpy(dtype=float)
        ax.bar(
            positions,
            heights,
            bottom=stack_base,
            color=colors[category],
            label=category,
            width=0.78,
        )
        stack_base = stack_base + heights
    ax.set_title(panel, loc="left", fontweight="bold")
    ax.set_xticks(positions)
    tick_labels = [TOOL_LABELS.get(tool, tool) for tool in tools]
    ax.set_xticklabels(tick_labels, rotation=45, ha="right")
    ax.set_xlabel("Prediction tool")
    ax.grid(axis="y", alpha=0.25)

axes[0].set_ylabel("CRISPRoffT unique site count")

# One shared legend, taken from the first panel (all panels use the same categories).
legend_handles, legend_labels = axes[0].get_legend_handles_labels()
fig.legend(
    legend_handles,
    legend_labels,
    loc="upper center",
    ncol=4,
    frameon=False,
    bbox_to_anchor=(0.5, 1.04),
)
fig.suptitle(
    "Overlap between CRISPRoffT sites and sites evaluated by prediction tools",
    y=1.13,
    fontweight="bold",
)
fig.tight_layout(rect=[0, 0, 1, 0.98])

# Write the same figure as PDF and PNG, then release it.
for extension in ("pdf", "png"):
    fig.savefig(
        figure_dir / f"figure_5_site_coverage_stacked_counts.{extension}",
        dpi=300,
        bbox_inches="tight",
    )
plt.close(fig)