# Source repository: git clone https://github.com/vibeforge1111/vibeship-spawner-skills
# File: ai/nlp-advanced/skill.yaml
# Skill manifest header: identity and routing description for this skill.
---
id: nlp-advanced
name: Advanced NLP
category: ai
# Folded scalar: reads as one long line at runtime, wrapped here for readability.
description: >-
  Use when extracting structured information from text - named entity
  recognition, relation extraction, coreference resolution, knowledge graph
  construction, and information extraction pipelines
# Core heuristics for this skill; each rule pairs a claim with its rationale.
patterns:
  golden_rules:
    - rule: "Pipeline order matters"
      reason: "NER → Coreference → Relation Extraction"
    - rule: "Context window affects accuracy"
      reason: "Longer context = better entity disambiguation"
    - rule: "Joint models > cascading"
      reason: "End-to-end avoids error propagation"
    - rule: "Domain matters enormously"
      reason: "Biomedical NER ≠ News NER"
    - rule: "LLMs are now competitive"
      reason: "GPT-4 level NER rivals fine-tuned BERT"
    - rule: "Evaluation must be strict"
      reason: "Exact match vs partial match metrics differ"
# Catalog of information-extraction tasks this skill covers, each with an
# example and (where applicable) representative model families.
task_landscape:
  named_entity_recognition:
    description: "Find & classify entities"
    example: "'Apple acquired Beats' → Apple(ORG), Beats(ORG)"
    models: ["BERT-NER", "SpaCy", "LLMs"]
  relation_extraction:
    description: "Find relationships between entities"
    example: "Apple --acquired--> Beats"
    models: ["BERT-RE", "Joint models"]
  coreference_resolution:
    description: "Group mentions of same entity"
    example: "'Apple... The company... It...' → all same"
    models: ["SpaCy coref", "Longformer"]
  entity_linking:
    description: "Link to knowledge base"
    example: "Apple → Q312 (Wikidata)"
    models: ["BLINK", "REL"]
  event_extraction:
    description: "Extract structured events"
    output: "Trigger, arguments, roles"
  knowledge_graph:
    description: "Build entity-relation graph"
    output: "(subject, predicate, object) triples"
# Three implementation strategies for NER, with trade-offs.
ner_approaches:
  transformers:
    description: "Fine-tuned BERT models"
    models:
      - "dslim/bert-base-NER (General)"
      - "dmis-lab/biobert-v1.1 (Biomedical)"
      - "Jean-Baptiste/roberta-large-ner-english (High accuracy)"
    pros: ["Accurate", "Customizable"]
    cons: ["Needs training data", "Domain-specific"]
  spacy:
    description: "Fast production NER"
    models:
      - "en_core_web_sm (Small)"
      - "en_core_web_lg (Large)"
      - "en_core_web_trf (Transformer)"
    pros: ["Fast", "Easy to use", "Good default"]
    cons: ["Less accurate than fine-tuned BERT"]
  llm_zero_shot:
    description: "Prompt-based NER"
    pros: ["No training data", "Any entity type"]
    cons: ["Slower", "More expensive", "Less consistent"]
# Sequence-labeling tag schemes used for token-level NER annotation.
label_schemes:
  bio:
    description: "Begin-Inside-Outside"
    example: "B-PER I-PER O B-ORG"
  bilou:
    description: "Begin-Inside-Last-Outside-Unit"
    example: "B-PER I-PER L-PER O U-ORG"
    advantage: "Better for short entities"
# Known failure modes: each entry names the mistake, its symptom, and the fix.
anti_patterns:
  - pattern: "Ignoring subword tokenization"
    problem: "Misaligned labels"
    solution: "Proper label alignment for BERT"
  - pattern: "Separate NER and RE models"
    problem: "Error propagation"
    solution: "Joint models"
  - pattern: "Ignoring coreference"
    problem: "Missing relations"
    solution: "Coref resolution first"
  - pattern: "Domain mismatch"
    problem: "Poor performance"
    solution: "Domain-specific fine-tuning"
  - pattern: "Exact match only evaluation"
    problem: "Missing partial matches"
    solution: "Multiple evaluation metrics"
  - pattern: "No entity linking"
    problem: "Ambiguous entities"
    solution: "Link to knowledge base"
# Pre-flight checklists per sub-task; items are free-text strings.
implementation_checklist:
  ner:
    - "Label scheme chosen (BIO, BILOU)"
    - "Subword alignment handled"
    - "Domain-appropriate model selected"
    - "Evaluation uses seqeval (entity-level)"
  relation_extraction:
    - "Entity marking strategy defined"
    - "Relation schema defined"
    - "Negative sampling handled"
    - "Joint vs pipeline decision made"
  full_pipeline:
    # NOTE(review): this order (Coref → NER → RE) conflicts with golden_rules,
    # which states "NER → Coreference → Relation Extraction" — confirm intent.
    - "Order: Coref → NER → RE"
    - "Entity deduplication implemented"
    - "Confidence thresholds set"
    - "Knowledge graph output format chosen"
# Routing table: when a trigger condition applies, hand off to another skill.
handoffs:
  - skill: "llm-fine-tuning"
    trigger: "domain-specific NLP model training"
  - skill: "transformer-architecture"
    trigger: "custom NLP architecture"
  - skill: "computer-vision-deep"
    trigger: "multi-modal document understanding"
# Recommended tooling grouped by pipeline stage.
ecosystem:
  ner_libraries:
    - "HuggingFace Transformers"
    - "SpaCy"
    - "Flair"
    - "Stanza"
  relation_extraction:
    - "OpenNRE"
    - "DeepKE"
    - "REBEL"
  knowledge_graphs:
    - "NetworkX"
    - "RDFLib"
    - "Neo4j"
    - "Wikidata"
  evaluation:
    - "seqeval"
    - "nervaluate"
# Reference material backing this skill's guidance.
sources:
  surveys:
    - "Relation Extraction Survey 2025"
    - "Information Extraction Pipelines Survey"
  tutorials:
    - "HuggingFace NER Guide"
    - "SpaCy NER Documentation"