Figure 5

Coverage of CRISPRoffT Sites by Prediction Tools

Figure 5 compares the CRISPRoffT off-target sites with the site sets evaluated by each prediction tool. The y-axis gives counts of unique CRISPRoffT site identities, defined by guide sequence, off-target sequence, and chromosome.

Each bar is split into four categories: validated true sites that were scored by the tool, labeled false sites that were scored by the tool, validated true sites that were not scored, and labeled false sites that were not scored. The panels separate discrete standard tools, continuous standard tools, and ML-based tools.

Predictions from the ML tools are available only for the full-length, no-bulge-compatible candidate universe. In panel C, CRISPRoffT sites outside that compatible space therefore remain in the denominator and appear as unscored for the ML tools.

Inputs

The CRISPRoffT site set comes from the filtered manuscript truth table:

data/manuscript/manuscript_primary.csv

The standard prediction contracts are expected under:

data/zenodo/standard_tool_predictions/

The ML prediction contracts are expected under:

data/zenodo/no_bulge_ml_tool_predictions/

These prediction contracts are large Zenodo-backed artifacts. During local development they can be symlinked into the expected directories.

Load CRISPRoffT Site Identities

Code

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from offtarget_benchmark.layout import repo_layout

# Resolve every input and output location from the repository layout.
layout = repo_layout()
data_root = layout.data_dir
zenodo_root = data_root / "zenodo"

# Filtered manuscript truth table that defines the CRISPRoffT site set.
truth_path = data_root / "manuscript" / "manuscript_primary.csv"

# Prediction contracts: one directory for the standard tools, one for the ML tools.
standard_prediction_dir = zenodo_root / "standard_tool_predictions"
ml_prediction_dir = zenodo_root / "no_bulge_ml_tool_predictions"

# Rendered figures are written here; create the directory if it does not exist yet.
figure_dir = layout.docs_dir / "generated_figures"
figure_dir.mkdir(parents=True, exist_ok=True)

def normalize_truth_status(value):
    """Map one raw ``truth_status`` cell to ``"true"``, ``"false"``, or ``"unknown"``.

    Args:
        value: A single cell value: a boolean, a number, a string, or a
            missing value.

    Returns:
        ``"true"`` for ``True``, the number 1, or the text ``true`` / ``1`` /
        ``yes`` / ``off``; ``"false"`` for ``False``, the number 0, or the text
        ``false`` / ``0`` / ``no`` / ``on``; ``"unknown"`` for missing values
        and anything else. Text is compared case-insensitively after
        stripping surrounding whitespace.
    """
    if pd.isna(value):
        return "unknown"
    if isinstance(value, (bool, np.bool_)):
        return "true" if bool(value) else "false"
    # A 0/1 flag column that contains missing values is read by pandas as
    # float64, so 1.0 / 0.0 must be treated the same as the integers 1 / 0
    # instead of falling through to "unknown" via the text "1.0" / "0.0".
    if isinstance(value, (int, float, np.integer, np.floating)):
        if value == 1:
            return "true"
        if value == 0:
            return "false"
        return "unknown"
    text = str(value).strip().lower()
    # NOTE(review): "off" -> true and "on" -> false presumably stand for
    # off-target / on-target labels; confirm against the truth table schema.
    if text in {"true", "1", "yes", "off"}:
        return "true"
    if text in {"false", "0", "no", "on"}:
        return "false"
    return "unknown"

# Load the truth table and attach a normalized true/false/unknown label to each row.
truth = pd.read_csv(truth_path, low_memory=False)
truth["truth_category"] = truth["truth_status"].map(normalize_truth_status)

# Only rows with a usable true/false label take part in the figure.
labeled_rows = truth.loc[truth["truth_category"].isin(["true", "false"])].copy()

# A site identity is guide sequence + off-target sequence + chromosome.
guide_part = labeled_rows["guide_seq"].astype(str)
offtarget_part = labeled_rows["offtarget_seq"].astype(str)
chromosome_part = labeled_rows["chromosome"].astype(str)
labeled_rows["match_key"] = guide_part + "::" + offtarget_part + "::" + chromosome_part

# Collapse repeated rows to one row per site identity; the first row seen
# for a key supplies its label.
truth_sites = (
    labeled_rows[["match_key", "truth_category"]]
    .drop_duplicates("match_key")
    .reset_index(drop=True)
)

# Displayed as the cell output: number of unique sites per label.
truth_sites["truth_category"].value_counts().rename_axis("truth_category").reset_index(name="n_unique_sites")

	truth_category	n_unique_sites
0	false	1740
1	true	142

The figure uses unique site identities. This avoids counting the same site/tool overlap more than once when the source table contains repeated rows for the same guide, off-target sequence, and chromosome.

Define Tool Groups and Read Prediction Contracts

Code

# Tool groups, one per figure panel.
DISCRETE_TOOLS = [
    "CRISPRitz_mismatch",
    "Cas-OFFinder",
    "CCTop",
    "GuideScan2",
    "FlashFry",
]
CONTINUOUS_TOOLS = [
    "MOFF",
    "CRISOT",
    "CRISPRitz_cfd",
    "CRISPROFF",
    "CRISPOR",
]
ML_TOOLS = [
    "CRISPert",
    "CRISPR-OFFT",
    "CRISPR-IP",
    "CnnCRISPR",
]

# Display names that differ from (or restate) the internal tool name; tools
# looked up with a fallback keep their own name when they have no entry here.
TOOL_LABELS = {
    "CRISPRitz_mismatch": "CRISPRitz_mm",
    "Cas-OFFinder": "Cas-OFFinder",
    "CCTop": "CCTop",
    "GuideScan2": "GuideScan",
    "FlashFry": "FlashFry",
    "CRISPRitz_cfd": "CRISPRitz_cfd",
}

# Contract file name per tool, grouped by the directory that holds it.
_STANDARD_CONTRACT_FILES = {
    "Cas-OFFinder": "prediction_contract_cas_offinder.csv",
    "CCTop": "prediction_contract_cctop.csv",
    "CRISPRitz_mismatch": "prediction_contract_crispritz_mismatch.csv",
    "FlashFry": "prediction_contract_flashfry.csv",
    "GuideScan2": "prediction_contract_guidescan2.csv",
    "CRISOT": "prediction_contract_crisot.csv",
    "CRISPOR": "prediction_contract_crispor.csv",
    "CRISPROFF": "prediction_contract_crisproff.csv",
    "CRISPRitz_cfd": "prediction_contract_crispritz_cfd.csv",
    "MOFF": "prediction_contract_moff.csv",
}
_ML_CONTRACT_FILES = {
    "CnnCRISPR": "prediction_contract_cnncrispr.csv",
    "CRISPR-IP": "prediction_contract_crisprip.csv",
    "CRISPR-OFFT": "prediction_contract_crispr_offt.csv",
    "CRISPert": "prediction_contract_crispert.csv",
}

# Full path of every prediction contract, standard tools first, then ML tools.
TOOL_FILES = {
    **{tool: standard_prediction_dir / file_name for tool, file_name in _STANDARD_CONTRACT_FILES.items()},
    **{tool: ml_prediction_dir / file_name for tool, file_name in _ML_CONTRACT_FILES.items()},
}

# Fail early, listing every absent contract at once, before any file is read.
missing = [str(contract_path) for contract_path in TOOL_FILES.values() if not contract_path.exists()]
if missing:
    raise FileNotFoundError("Missing prediction contracts:\n" + "\n".join(missing))

def read_scored_keys(path):
    """Return the set of site keys listed in one prediction contract.

    Only the ``guide_key``, ``off_target_seq`` and ``chromosome`` columns are
    read. Each complete row becomes a ``"guide::off_target::chromosome"``
    string.

    Args:
        path: Location of the prediction contract CSV.

    Returns:
        A set of key strings, one per distinct complete row. Rows with a
        missing value in any of the three key columns are excluded.
    """
    key_columns = ["guide_key", "off_target_seq", "chromosome"]
    contract = pd.read_csv(path, usecols=key_columns, low_memory=False)
    # Drop incomplete rows BEFORE stringifying: once a missing value has been
    # converted to the text "nan" it can no longer be detected by dropna(),
    # and the row would produce a bogus key such as "nan::SEQ::chr1".
    contract = contract.dropna(subset=key_columns)
    keys = (
        contract["guide_key"].astype(str)
        + "::"
        + contract["off_target_seq"].astype(str)
        + "::"
        + contract["chromosome"].astype(str)
    )
    return set(keys)

# Figure panels in display order, each paired with the tools it contains.
panel_groups = [
    ("A. Discrete standard tools", DISCRETE_TOOLS),
    ("B. Continuous standard tools", CONTINUOUS_TOOLS),
    ("C. ML tools", ML_TOOLS),
]

# Category name for each (scored?, truth label) combination, listed in the
# order the rows are emitted for every tool.
category_labels = {
    (True, "true"): "scored (true)",
    (True, "false"): "scored (false)",
    (False, "true"): "unscored (true)",
    (False, "false"): "unscored (false)",
}

site_keys = truth_sites["match_key"].astype(str)
site_labels = truth_sites["truth_category"]

coverage_rows = []
for panel, tools in panel_groups:
    for tool in tools:
        # A site counts as scored when its key appears in the tool's contract.
        in_contract = site_keys.isin(read_scored_keys(TOOL_FILES[tool]))
        for (want_scored, want_label), category in category_labels.items():
            scored_match = in_contract if want_scored else ~in_contract
            n_sites = int((scored_match & (site_labels == want_label)).sum())
            coverage_rows.append({"panel": panel, "tool": tool, "category": category, "n_sites": n_sites})

coverage = pd.DataFrame(coverage_rows)
coverage.head(12)

	panel	tool	category	n_sites
0	A. Discrete standard tools	CRISPRitz_mismatch	scored (true)	100
1	A. Discrete standard tools	CRISPRitz_mismatch	scored (false)	642
2	A. Discrete standard tools	CRISPRitz_mismatch	unscored (true)	42
3	A. Discrete standard tools	CRISPRitz_mismatch	unscored (false)	1098
4	A. Discrete standard tools	Cas-OFFinder	scored (true)	89
5	A. Discrete standard tools	Cas-OFFinder	scored (false)	530
6	A. Discrete standard tools	Cas-OFFinder	unscored (true)	53
7	A. Discrete standard tools	Cas-OFFinder	unscored (false)	1210
8	A. Discrete standard tools	CCTop	scored (true)	80
9	A. Discrete standard tools	CCTop	scored (false)	362
10	A. Discrete standard tools	CCTop	unscored (true)	62
11	A. Discrete standard tools	CCTop	unscored (false)	1378

A site is scored when it appears in the standardized prediction contract for the tool. For native search tools this means the site was returned by the tool. For pair-scoring tools and ML tools this means the site was part of the candidate universe supplied to that scorer.

Category Counts

Code

# Wide table of site counts: one row per (panel, tool), one column per category.
# pivot_table aggregates with the mean by default, which is the wrong
# aggregator for counts and renders them as floats; sum the counts instead so
# the table reports whole-number site counts.
coverage.pivot_table(
    index=["panel", "tool"],
    columns="category",
    values="n_sites",
    aggfunc="sum",
    fill_value=0,
).reset_index()

category	panel	tool	scored (false)	scored (true)	unscored (false)	unscored (true)
0	A. Discrete standard tools	CCTop	362.0	80.0	1378.0	62.0
1	A. Discrete standard tools	CRISPRitz_mismatch	642.0	100.0	1098.0	42.0
2	A. Discrete standard tools	Cas-OFFinder	530.0	89.0	1210.0	53.0
3	A. Discrete standard tools	FlashFry	191.0	15.0	1549.0	127.0
4	A. Discrete standard tools	GuideScan2	225.0	24.0	1515.0	118.0
5	B. Continuous standard tools	CRISOT	684.0	142.0	1056.0	0.0
6	B. Continuous standard tools	CRISPOR	315.0	81.0	1425.0	61.0
7	B. Continuous standard tools	CRISPROFF	676.0	107.0	1064.0	35.0
8	B. Continuous standard tools	CRISPRitz_cfd	642.0	100.0	1098.0	42.0
9	B. Continuous standard tools	MOFF	684.0	142.0	1056.0	0.0
10	C. ML tools	CRISPR-IP	683.0	138.0	1057.0	4.0
11	C. ML tools	CRISPR-OFFT	683.0	138.0	1057.0	4.0
12	C. ML tools	CRISPert	683.0	138.0	1057.0	4.0
13	C. ML tools	CnnCRISPR	683.0	138.0	1057.0	4.0

Render the Figure

Code

# Stacking order from the bottom of each bar to the top.
category_order = [
    "scored (false)",
    "scored (true)",
    "unscored (true)",
    "unscored (false)",
]
# Fill color per category.
colors = {
    "scored (true)": "#08306b",
    "scored (false)": "#9ecae1",
    "unscored (true)": "#7f0000",
    "unscored (false)": "#fcae91",
}
# One subplot per panel; each tool list fixes the left-to-right bar order.
panel_tools = {
    "A. Discrete standard tools": DISCRETE_TOOLS,
    "B. Continuous standard tools": CONTINUOUS_TOOLS,
    "C. ML tools": ML_TOOLS,
}

fig, axes = plt.subplots(1, 3, figsize=(15.5, 5.4), sharey=True)
for ax, panel in zip(axes, panel_tools):
    tools = panel_tools[panel]
    panel_counts = coverage.loc[coverage["panel"] == panel]
    positions = np.arange(len(tools))
    stack_top = np.zeros(len(tools), dtype=float)

    for category in category_order:
        category_rows = panel_counts.loc[panel_counts["category"] == category]
        # Align the counts to the panel's tool order; a tool with no row gets 0.
        heights = (
            category_rows.set_index("tool")["n_sites"]
            .reindex(tools)
            .fillna(0)
            .to_numpy(dtype=float)
        )
        ax.bar(positions, heights, bottom=stack_top, color=colors[category], label=category, width=0.78)
        # Each segment starts where the previous one ended.
        stack_top = stack_top + heights

    tick_labels = [TOOL_LABELS.get(tool, tool) for tool in tools]
    ax.set_title(panel, loc="left", fontweight="bold")
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha="right")
    ax.set_xlabel("Prediction tool")
    ax.grid(axis="y", alpha=0.25)

# The y-axis is shared, so only the first panel carries the axis label.
axes[0].set_ylabel("CRISPRoffT unique site count")

# Every panel draws the same four categories, so one figure-level legend is
# built from the first panel's handles.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc="upper center", ncol=4, frameon=False, bbox_to_anchor=(0.5, 1.04))
fig.suptitle("Overlap between CRISPRoffT sites and sites evaluated by prediction tools", y=1.13, fontweight="bold")
fig.tight_layout(rect=[0, 0, 1, 0.98])

# Write the same figure as PDF and PNG, then release it.
for extension in ("pdf", "png"):
    fig.savefig(figure_dir / ("figure_5_site_coverage_stacked_counts." + extension), dpi=300, bbox_inches="tight")
plt.close(fig)