Source code for simi_search.metrics

"""Ranking metrics for virtual-screening benchmarks."""

from __future__ import annotations

import math
from collections.abc import Sequence
from dataclasses import dataclass



[docs]
@dataclass(frozen=True)
class RankingMetrics:
    compounds: int
    actives: int
    ef1: float
    ef5: float
    bedroc20: float
    roc_auc: float
    pr_auc: float
    top1_actives: int
    top5_actives: int



def _top_fraction_size(total: int, fraction: float) -> int:
    if total <= 0:
        return 0
    return max(1, math.ceil(total * fraction))



[docs]
def enrichment_factor(labels: Sequence[int], fraction: float) -> tuple[float, int]:
    total = len(labels)
    actives = sum(labels)
    if total == 0 or actives == 0:
        return 0.0, 0
    top_k = _top_fraction_size(total, fraction)
    top_actives = sum(labels[:top_k])
    expected = top_k * (actives / total)
    return top_actives / expected if expected else 0.0, top_actives




[docs]
def roc_auc(labels: Sequence[int], scores: Sequence[float]) -> float:
    positives = sum(labels)
    negatives = len(labels) - positives
    if positives == 0 or negatives == 0:
        return 0.0

    pairs = sorted(zip(scores, labels, strict=True), key=lambda item: item[0])
    rank_sum = 0.0
    rank = 1
    index = 0
    while index < len(pairs):
        end = index + 1
        while end < len(pairs) and pairs[end][0] == pairs[index][0]:
            end += 1
        average_rank = (rank + rank + (end - index) - 1) / 2.0
        rank_sum += average_rank * sum(label for _, label in pairs[index:end])
        rank += end - index
        index = end

    return (rank_sum - positives * (positives + 1) / 2.0) / (positives * negatives)




[docs]
def pr_auc(labels: Sequence[int]) -> float:
    positives = sum(labels)
    if positives == 0:
        return 0.0

    area = 0.0
    true_positives = 0
    previous_recall = 0.0
    for rank, label in enumerate(labels, start=1):
        if label:
            true_positives += 1
            recall = true_positives / positives
            precision = true_positives / rank
            area += (recall - previous_recall) * precision
            previous_recall = recall
    return area




[docs]
def bedroc(labels: Sequence[int], alpha: float = 20.0) -> float:
    """Compute BEDROC using the Truchon/Jain early-recognition formula."""

    total = len(labels)
    actives = sum(labels)
    if total == 0 or actives == 0 or actives == total:
        return 0.0

    active_ranks = [index for index, label in enumerate(labels, start=1) if label]
    rie_numerator = sum(math.exp(-alpha * rank / total) for rank in active_ranks)
    rie_denominator = (actives / total) * (1.0 - math.exp(-alpha)) / (math.exp(alpha / total) - 1.0)
    rie = rie_numerator / rie_denominator if rie_denominator else 0.0

    ratio = actives / total
    rie_max = (1.0 - math.exp(-alpha * ratio)) / (ratio * (1.0 - math.exp(-alpha)))
    rie_min = (1.0 - math.exp(alpha * ratio)) / (ratio * (1.0 - math.exp(alpha)))
    if rie_max == rie_min:
        return 0.0
    return max(0.0, min(1.0, (rie - rie_min) / (rie_max - rie_min)))




[docs]
def compute_ranking_metrics(labels: Sequence[int], scores: Sequence[float]) -> RankingMetrics:
    ranked = sorted(zip(scores, labels, strict=True), key=lambda item: item[0], reverse=True)
    ranked_scores = [score for score, _ in ranked]
    ranked_labels = [label for _, label in ranked]
    ef1, top1_actives = enrichment_factor(ranked_labels, 0.01)
    ef5, top5_actives = enrichment_factor(ranked_labels, 0.05)
    return RankingMetrics(
        compounds=len(ranked_labels),
        actives=sum(ranked_labels),
        ef1=ef1,
        ef5=ef5,
        bedroc20=bedroc(ranked_labels, alpha=20.0),
        roc_auc=roc_auc(ranked_labels, ranked_scores),
        pr_auc=pr_auc(ranked_labels),
        top1_actives=top1_actives,
        top5_actives=top5_actives,
    )