From agent-skills
Extracts and normalizes metadata from scientific repositories (GEO, SRA/ENA, PRIDE, BioStudies, PubMed). Handles per-sample IDs, author names, assay types, species, file formats, and a gap-fill strategy for missing annotations.
How this skill is triggered — by the user, by Claude, or both
Slash command
/agent-skills:scientific-metadata-extraction
The summary Claude sees in its skill listing — used to decide when to auto-load this skill
Patterns for extracting and normalizing metadata from scientific data repositories (GEO, SRA/ENA, PRIDE, BioStudies, PubMed) and publication sources. Covers per-sample identifier derivation, author name normalization, assay type determination, species verification, file format normalization, and a 4-tier gap-fill strategy for resolving missing annotation fields.
Patterns for extracting and normalizing metadata from scientific data repositories (GEO, SRA/ENA, PRIDE, BioStudies, PubMed) and publication sources. Covers per-sample identifier derivation, author name normalization, assay type determination, species verification, file format normalization, and a 4-tier gap-fill strategy for resolving missing annotation fields.
pip install biopython httpx
from Bio import Entrez
import os
# NCBI asks every E-utilities client to identify itself with a contact e-mail.
# The fallback below is only a placeholder: set NCBI_EMAIL (or edit the
# literal) to a real address before running against NCBI.
Entrez.email = os.environ.get("NCBI_EMAIL", "your.name@example.org")
if api_key := os.environ.get("NCBI_API_KEY"):
    Entrez.api_key = api_key  # raises limit from 3 to 10 req/s
library_strategy/library_source in repository records.
Always derive investigator names from the PubMed AuthorList — first author + last/corresponding author in "Firstname Lastname" format.
from Bio import Entrez
def fetch_study_leads(pmid: str) -> list[str]:
    """Return [first_author, last_author] as 'Firstname Lastname' strings.

    The names are taken from the AuthorList of the first PubmedArticle
    returned for ``pmid``.

    Args:
        pmid: PubMed identifier; it is converted to ``str`` before the request.

    Returns:
        The first and last named authors. The list has a single entry when
        both are the same name (e.g. a single-author paper) and is empty when
        the record has no article or no usable author names.
    """
    handle = Entrez.efetch(db="pubmed", id=str(pmid), rettype="xml", retmode="xml")
    try:
        recs = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing fails.
        handle.close()

    articles = recs.get("PubmedArticle") or []
    if not articles:
        return []
    authors = articles[0]["MedlineCitation"]["Article"].get("AuthorList") or []

    def fmt(auth) -> str:
        fore = str(auth.get("ForeName", ""))
        last = str(auth.get("LastName", ""))
        full = f"{fore} {last}".strip()
        # Consortium entries carry a CollectiveName instead of personal names.
        return full or str(auth.get("CollectiveName", "")).strip()

    # Drop entries that yield no name at all so they cannot become a "lead".
    names = [n for n in map(fmt, authors) if n]
    if not names:
        return []
    # dict.fromkeys keeps order while collapsing first == last.
    return list(dict.fromkeys([names[0], names[-1]]))
# Reformat GEO-style "Lastname FI" contributors
import re
def normalize_author_name(name: str) -> str:
    """Reorder an author name so the given name (or initial) comes first.

    'Smith JP' → 'J Smith', 'Doe, Jane' → 'Jane Doe'

    Two input shapes are recognised:

    * ``"Family, Given"`` — split on the first comma and swapped.
    * ``"Family XX"`` — a second token of at most three upper-case
      characters is treated as initials; only the first initial is kept.

    Any other input is returned with surrounding whitespace removed.
    """
    cleaned = name.strip()
    if "," in cleaned:
        family, given = (part.strip() for part in cleaned.split(",", 1))
        # strip() avoids a stray space when one side of the comma is empty.
        return f"{given} {family}".strip()
    tokens = cleaned.split()
    if len(tokens) == 2 and len(tokens[1]) <= 3 and tokens[1].isupper():
        return f"{tokens[1][0]} {tokens[0]}"
    return cleaned
Map run accessions (SRR/ERR) to biological sample identifiers. Strip paired-end suffixes before lookup.
import httpx, re
def get_ena_filereport(accession: str) -> list[dict]:
    """Query the ENA portal filereport endpoint for run-level metadata.

    Args:
        accession: Study or experiment accession to report on.

    Returns:
        The decoded JSON body when the server answers 200; an empty list for
        any other status code.
    """
    columns = (
        "run_accession",
        "sample_accession",
        "sample_title",
        "sample_alias",
        "scientific_name",
        "instrument_model",
        "library_strategy",
        "library_source",
        "library_layout",
        "fastq_ftp",
        "submitted_ftp",
    )
    query = {
        "accession": accession,
        "result": "read_run",
        "fields": ",".join(columns),
        "format": "json",
    }
    response = httpx.get(
        "https://www.ebi.ac.uk/ena/portal/api/filereport",
        params=query,
        timeout=30,
    )
    if response.status_code != 200:
        return []
    return response.json()
def strip_paired_suffix(filename: str) -> str:
    """Reduce a read file name to its run accession.

    'SRR12345_1.fastq.gz' → 'SRR12345'

    Any leading directory or URL path is discarded first, so values such as
    'ftp.sra.ebi.ac.uk/vol1/fastq/SRR123/SRR12345_1.fastq.gz' work too.
    Everything from the first '.' of the base name onward is then dropped,
    followed by one trailing '_<digits>' group.
    """
    # Without this, dots in the host or directory part truncate the stem.
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    stem = basename.split(".")[0]
    return re.sub(r"_\d+$", "", stem)
# Build SRR → specimen_id map
filereport = get_ena_filereport("SRP123456")
srr_to_specimen = {
    # Chain with `or` so an empty or null sample_alias also falls back to
    # "N/A"; a .get() default only applies when the key is absent.
    r["run_accession"]: r.get("sample_title") or r.get("sample_alias") or "N/A"
    for r in filereport
}
# Look up specimen ID for a file
# NOTE(review): `filename` is not defined in this snippet — presumably it is
# supplied by the surrounding code; confirm.
srr_id = strip_paired_suffix(filename)
specimen_id = srr_to_specimen.get(srr_id, "N/A")
from Bio import Entrez
from xml.etree import ElementTree as ET
def get_biosample_attrs(biosample_acc: str) -> dict[str, str]:
    """Return attribute dict from a BioSample record (SAMN/SAME/SAMD).

    Every ``<Attribute>`` element in the fetched XML becomes one entry. The
    key is the element's ``attribute_name``, or its ``harmonized_name`` when
    the former is missing or empty; elements with neither are skipped.

    Args:
        biosample_acc: BioSample accession passed to ``Entrez.efetch``.

    Returns:
        Mapping of attribute name to its text value. Elements without text
        map to an empty string.
    """
    handle = Entrez.efetch(db="biosample", id=biosample_acc, rettype="xml")
    try:
        root = ET.parse(handle).getroot()
    finally:
        # Release the HTTP connection even when the XML cannot be parsed.
        handle.close()

    attrs: dict[str, str] = {}
    for attr in root.iter("Attribute"):
        name = attr.get("attribute_name") or attr.get("harmonized_name", "")
        if name:
            # attr.text is None for empty elements; keep the values as str.
            attrs[name] = (attr.text or "").strip()
    return attrs
# common keys: sex, age, tissue, disease, genotype, cell_type
from Bio import Entrez
def get_gsm_characteristics(gsm_accession: str) -> dict[str, str]:
    """Return sample characteristics for a GEO GSM record.

    Only lines starting with ``!Sample_characteristics_ch1`` are used. The
    text after the first ``=`` must look like ``key: value``; the key is
    lower-cased and both sides are stripped. A repeated key keeps the last
    value seen.

    Args:
        gsm_accession: Identifier passed to ``Entrez.efetch`` (db="gds").

    Returns:
        Mapping of lower-cased characteristic name to its value.
    """
    handle = Entrez.efetch(db="gds", id=gsm_accession, rettype="full", retmode="text")
    try:
        text = handle.read()
    finally:
        # Release the HTTP connection even when reading fails.
        handle.close()

    attrs: dict[str, str] = {}
    for line in text.splitlines():
        if not line.startswith("!Sample_characteristics_ch1"):
            continue
        val = line.split("=", 1)[-1].strip()
        if ":" not in val:
            continue
        k, v = val.split(":", 1)
        key = k.strip().lower()
        if key:  # ignore malformed entries such as ": value"
            attrs[key] = v.strip()
    return attrs
Never infer assay type from paper title — verify from repository library strategy and source fields.
SCRNA_SIGNALS = {
"transcriptomic single cell", # ENA library_source
"scrna-seq", # library_strategy variants
"10x chromium", "drop-seq", "indrop", "smart-seq2", "fluidigm c1",
}
def determine_assay(library_source: str = "", library_strategy: str = "",
protocol: str = "") -> str | None:
combined = f"{library_source} {library_strategy} {protocol}".lower()
if any(s in combined for s in SCRNA_SIGNALS):
return "single-cell RNA-seq"
if "rna" in combined or "transcriptomic" in combined:
return "RNA-seq"
if "atac" in combined:
return "ATAC-seq"
if "chip" in combined:
return "ChIP-seq"
if "wgs" in combined or "whole genome" in combined:
return "WGS"
if "wes" in combined or "exome" in combined:
return "WES"
if "bisulfite" in combined or "methyl" in combined:
return "Bisulfite-seq"
if "hi-c" in combined or "hic" in combined:
return "Hi-C"
return None # unknown — flag for human review
Strip compression suffixes before storing file format annotations.
COMPRESSION_SUFFIXES = {".gz", ".bz2", ".zip", ".zst", ".xz"}


def normalize_file_format(filename: str) -> str:
    """Return the lower-case file format with compression suffixes removed.

    'sample.fastq.gz' → 'fastq', 'counts.txt.gz' → 'txt'

    Only the final path component is inspected, so dots in directory names
    cannot leak into the result. Trailing extensions listed in
    ``COMPRESSION_SUFFIXES`` are dropped repeatedly. An empty string is
    returned when no extension remains.
    """
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    parts = basename.lower().split(".")
    while parts and f".{parts[-1]}" in COMPRESSION_SUFFIXES:
        parts.pop()
    # A single remaining part is the bare stem, i.e. there is no extension.
    return parts[-1] if len(parts) > 1 else ""
When a required annotation field is missing, work through these tiers in order. Stop at the first tier that yields a valid, unambiguous value.
Tier 1 — Structured repository metadata
Primary source: ENA filereport columns, BioSample XML, GEO GSM !Sample_characteristics_ch1, SRA RunInfo.
Tier 2 — Publication metadata
PMC full-text methods section, CrossRef funder info, supplementary tables (download and parse).
Tier 3 — Text extraction via reasoning
Abstract and methods text — extract only unambiguous values explicitly stated (e.g., "all mice were female"). Flag in review comments when using this tier.
Tier 4 — Data file inspection
h5ad/loom .obs columns, BAM @RG tags, FASTQ headers, count matrix column headers.
Only after exhausting all four tiers should a field be marked unresolvable. Document each unresolvable field and its reason in the curation review comment.
def resolve_field(field_name: str, filereport_row: dict,
biosample_attrs: dict, gsm_chars: dict) -> str | None:
"""Tier 1 lookup across structured repository sources."""
# ENA filereport first
for key in [field_name, field_name.lower(), field_name.replace("_", " ")]:
if key in filereport_row and filereport_row[key]:
return filereport_row[key]
# BioSample attributes
for key in [field_name, field_name.lower()]:
if key in biosample_attrs and biosample_attrs[key]:
return biosample_attrs[key]
# GEO characteristics
for key in [field_name, field_name.lower()]:
if key in gsm_chars and gsm_chars[key]:
return gsm_chars[key]
return None # move to Tier 2+
scientific_name, GEO !Series_sample_taxid, or BioStudies Organism — never assume species from disease context or model name.
Entrez.esummary(db='gds', ...)["PubMedIds"] matches the PMID being processed.
[], no valid values exist in the current schema — do not set the field.
NCBI_API_KEY. Add time.sleep(0.34) between Entrez calls without an API key (3 req/s limit); time.sleep(0.12) is enough with a key (10 req/s limit).
Provides UI/UX resources: 50+ styles, color palettes, font pairings, guidelines, charts for web/mobile across React, Next.js, Vue, Svelte, Tailwind, React Native, Flutter. Aids planning, building, reviewing interfaces.
Searches MemPalace before answering questions about past work, people, projects, or prior decisions. Returns verbatim stored content instead of guessing from model memory.
npx claudepluginhub sage-bionetworks/agent-skills --plugin agent-skills