"""Per-file FASTA motif counter (intended for parallel use via **concurrent.futures**)."""
from __future__ import annotations
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from .fmindex import FMIndex
__all__ = ["process_fasta_file"]
def _count_in_fm(fm: FMIndex, motif: bytes, *, allow_overlap: bool = False) -> int:
if allow_overlap:
return fm.count(motif)
starts = sorted(fm.locate(motif))
m = len(motif)
last, non_overlap = -m, 0
for s in starts:
if s >= last + m:
non_overlap += 1
last = s
return non_overlap
[docs]
def process_fasta_file(
fasta_path: str | os.PathLike,
pattern_to_motifs: dict[str, list[str]],
*,
out_dir: str | os.PathLike = "bioquik_results",
) -> str:
"""Count motifs in *fasta_path* and save CSV → *out_dir*.
Returns the output CSV filepath.
"""
fasta_path = Path(fasta_path)
seq = "".join(
line.strip().upper()
for line in fasta_path.read_text().splitlines()
if not line.startswith(">")
)
tqdm.write(f" Building FM-index for {fasta_path.name} …")
fm = FMIndex(seq)
results: list[dict[str, str | int]] = []
for pattern_key, motif_list in pattern_to_motifs.items():
for motif in motif_list:
c = _count_in_fm(fm, motif.encode(), allow_overlap=False)
if c:
results.append({"Pattern": pattern_key, "Motif": motif, "Count": c})
out_path = Path(out_dir) / f"{fasta_path.stem}_motif_counts.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(
results,
columns=["Pattern", "Motif", "Count"],
).to_csv(out_path, index=False)
tqdm.write(f"→ {fasta_path} → {out_path.name}")
return str(out_path)