Vibeship-spawner-skills drug-discovery-informatics

Drug Discovery Informatics Skill

install
source · Clone the upstream repo
git clone https://github.com/vibeforge1111/vibeship-spawner-skills
manifest: biotech/drug-discovery-informatics/skill.yaml
source content

Drug Discovery Informatics Skill

Computational approaches for drug discovery and development

id: drug-discovery-informatics name: Drug Discovery Informatics category: biotech complexity: advanced requires_skills:

  • protein-structure
  • statistical-analysis

description: | Patterns for computer-aided drug discovery including virtual screening, molecular docking, ADMET prediction, lead optimization, and integration with AI/ML methods. Covers both structure-based and ligand-based approaches.

patterns:

virtual_screening: name: Virtual Screening Workflow description: Screen large compound libraries against targets when: "Need to identify hit compounds from large databases" pattern: | from pathlib import Path from dataclasses import dataclass from typing import List, Optional, Tuple import pandas as pd

  @dataclass
  class VirtualScreeningConfig:
      """Configuration for virtual screening campaign."""
      library_path: Path
      target_structure: Path
      binding_site: List[Tuple[float, float, float]]  # Center coordinates
      box_size: Tuple[float, float, float] = (20, 20, 20)
      exhaustiveness: int = 8
      num_modes: int = 9
      energy_cutoff: float = 3.0  # kcal/mol from best

  def prepare_receptor(
      pdb_file: Path,
      output_pdbqt: Path,
      remove_waters: bool = True,
      add_hydrogens: bool = True
  ) -> Path:
      """
      Prepare receptor for docking.

      Steps:
      1. Remove waters and heteroatoms (optional)
      2. Add hydrogens at pH 7.4
      3. Assign Gasteiger charges
      4. Convert to PDBQT format
      """
      from openbabel import openbabel

      conv = openbabel.OBConversion()
      conv.SetInAndOutFormats("pdb", "pdbqt")

      mol = openbabel.OBMol()
      conv.ReadFile(mol, str(pdb_file))

      if remove_waters:
          # Remove water molecules
          for residue in list(mol.GetResidues()):
              if residue.GetName() == "HOH":
                  mol.DeleteResidue(residue)

      if add_hydrogens:
          mol.AddHydrogens()

      # Assign charges
      charge_model = openbabel.OBChargeModel.FindType("gasteiger")
      charge_model.ComputeCharges(mol)

      conv.WriteFile(mol, str(output_pdbqt))
      return output_pdbqt

  def run_vina_screen(
      receptor: Path,
      ligand_library: Path,
      config: VirtualScreeningConfig,
      output_dir: Path,
      n_cpus: int = 8
  ) -> pd.DataFrame:
      """
      Run AutoDock Vina virtual screening.

      Returns DataFrame with compound IDs and docking scores.
      """
      import subprocess

      output_dir.mkdir(parents=True, exist_ok=True)

      results = []
      ligands = list(ligand_library.glob("*.pdbqt"))

      for ligand in ligands:
          output_file = output_dir / f"{ligand.stem}_docked.pdbqt"
          log_file = output_dir / f"{ligand.stem}.log"

          cmd = [
              "vina",
              "--receptor", str(receptor),
              "--ligand", str(ligand),
              "--out", str(output_file),
              "--log", str(log_file),
              "--center_x", str(config.binding_site[0][0]),
              "--center_y", str(config.binding_site[0][1]),
              "--center_z", str(config.binding_site[0][2]),
              "--size_x", str(config.box_size[0]),
              "--size_y", str(config.box_size[1]),
              "--size_z", str(config.box_size[2]),
              "--exhaustiveness", str(config.exhaustiveness),
              "--num_modes", str(config.num_modes),
              "--cpu", str(n_cpus)
          ]

          subprocess.run(cmd, check=True, capture_output=True)

          # Parse score from log
          score = parse_vina_log(log_file)
          results.append({
              'compound_id': ligand.stem,
              'docking_score': score,
              'output_file': str(output_file)
          })

      return pd.DataFrame(results).sort_values('docking_score')
why: "Virtual screening reduces experimental costs by 100-1000x"

molecular_docking: name: Molecular Docking Best Practices description: Dock ligands to protein targets accurately pattern: | from pathlib import Path from dataclasses import dataclass import subprocess

  @dataclass
  class DockingResult:
      """Results from docking calculation."""
      ligand_id: str
      score: float
      pose_file: Path
      interactions: dict
      rmsd_from_input: Optional[float] = None

  def dock_with_gnina(
      receptor: Path,
      ligand: Path,
      output: Path,
      binding_site: Tuple[float, float, float],
      box_size: float = 20,
      exhaustiveness: int = 16
  ) -> DockingResult:
      """
      Dock using GNINA (CNN-based scoring).

      GNINA often outperforms classical scoring functions.
      """
      cmd = [
          "gnina",
          "--receptor", str(receptor),
          "--ligand", str(ligand),
          "--out", str(output),
          "--center_x", str(binding_site[0]),
          "--center_y", str(binding_site[1]),
          "--center_z", str(binding_site[2]),
          "--size", str(box_size),
          "--exhaustiveness", str(exhaustiveness),
          "--cnn_scoring", "rescore",
          "--cnn", "crossdock_default2018"
      ]

      result = subprocess.run(cmd, capture_output=True, text=True)

      # Parse CNN score
      # ...
      return DockingResult(...)

  # Covalent docking for covalent inhibitors
  def dock_covalent(
      receptor: Path,
      ligand: Path,
      reactive_residue: str,  # e.g., "CYS145"
      warhead_type: str = "acrylamide"
  ) -> DockingResult:
      """
      Covalent docking for covalent drug discovery.

      CRITICAL: Standard docking CANNOT handle covalent bonds.
      Use specialized tools: CovDock, DOCKovalent, GOLD covalent.
      """
      # Using AutoDock-GPU covalent mode or CovDock
      # ...
      pass

  # Pose analysis and validation
  def analyze_binding_interactions(
      complex_pdb: Path
  ) -> dict:
      """
      Analyze protein-ligand interactions.

      Returns hydrogen bonds, hydrophobic contacts,
      pi-stacking, salt bridges.
      """
      from plip.structure.preparation import PDBComplex

      complex_obj = PDBComplex()
      complex_obj.load_pdb(str(complex_pdb))
      complex_obj.analyze()

      interactions = {}
      for ligand_id, binding_site in complex_obj.interaction_sets.items():
          interactions[ligand_id] = {
              'h_bonds': binding_site.hbonds_ldon + binding_site.hbonds_pdon,
              'hydrophobic': binding_site.hydrophobic_contacts,
              'pi_stacking': binding_site.pistacking,
              'salt_bridges': binding_site.saltbridge_lneg + binding_site.saltbridge_pneg
          }

      return interactions
why: "Accurate docking predicts binding modes for lead optimization"

admet_prediction: name: ADMET Property Prediction description: Predict absorption, distribution, metabolism, excretion, toxicity critical: true pattern: | from rdkit import Chem from rdkit.Chem import Descriptors, Lipinski, QED import pandas as pd from dataclasses import dataclass

  @dataclass
  class ADMETProfile:
      """ADMET property predictions for a compound."""
      # Physicochemical
      molecular_weight: float
      logp: float
      tpsa: float
      h_bond_donors: int
      h_bond_acceptors: int
      rotatable_bonds: int

      # Drug-likeness
      lipinski_violations: int
      qed_score: float
      brenk_alerts: int

      # Predicted properties
      absorption_class: str  # High/Medium/Low
      bbb_penetration: bool
      pgp_substrate: bool
      cyp_inhibition: dict  # CYP1A2, CYP2C19, etc.
      herg_liability: str  # High/Medium/Low

      # Toxicity
      ames_mutagenic: bool
      hepatotoxicity_risk: str
      carcinogenicity_risk: str

  def calculate_physicochemical(smiles: str) -> dict:
      """Calculate basic physicochemical properties."""
      mol = Chem.MolFromSmiles(smiles)
      if mol is None:
          raise ValueError(f"Invalid SMILES: {smiles}")

      return {
          'molecular_weight': Descriptors.MolWt(mol),
          'logp': Descriptors.MolLogP(mol),
          'tpsa': Descriptors.TPSA(mol),
          'h_bond_donors': Descriptors.NumHDonors(mol),
          'h_bond_acceptors': Descriptors.NumHAcceptors(mol),
          'rotatable_bonds': Descriptors.NumRotatableBonds(mol),
          'aromatic_rings': Descriptors.NumAromaticRings(mol),
          'fraction_sp3': Descriptors.FractionCSP3(mol)
      }

  def check_lipinski_rule_of_5(smiles: str) -> dict:
      """
      Lipinski's Rule of Five for oral bioavailability.

      Violations:
      - MW > 500
      - LogP > 5
      - H-bond donors > 5
      - H-bond acceptors > 10
      """
      mol = Chem.MolFromSmiles(smiles)

      violations = 0
      details = {}

      mw = Descriptors.MolWt(mol)
      if mw > 500:
          violations += 1
          details['mw'] = f"{mw:.1f} > 500"

      logp = Descriptors.MolLogP(mol)
      if logp > 5:
          violations += 1
          details['logp'] = f"{logp:.2f} > 5"

      hbd = Descriptors.NumHDonors(mol)
      if hbd > 5:
          violations += 1
          details['hbd'] = f"{hbd} > 5"

      hba = Descriptors.NumHAcceptors(mol)
      if hba > 10:
          violations += 1
          details['hba'] = f"{hba} > 10"

      return {
          'violations': violations,
          'passes': violations <= 1,  # One violation allowed
          'details': details
      }

  def predict_admet_ml(smiles: str) -> ADMETProfile:
      """
      ML-based ADMET prediction.

      Uses models like ADMETlab, SwissADME, pkCSM.
      """
      # Using ADMETlab or similar
      # Many tools available as web services or local models

      # Example with admetSAR predictions
      # ...
      pass

  # PAINS filter for reactive compounds
  def check_pains(smiles: str) -> list:
      """
      Check for Pan-Assay INterference compoundS.

      PAINS are frequent hitters that give false positives.
      """
      from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

      mol = Chem.MolFromSmiles(smiles)
      params = FilterCatalogParams()
      params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
      catalog = FilterCatalog(params)

      matches = []
      for match in catalog.GetMatches(mol):
          matches.append(match.GetDescription())

      return matches
why: "ADMET failures cause 50% of clinical drug development failures"

lead_optimization: name: Lead Optimization Strategies description: Optimize hits to leads with better properties pattern: | from rdkit import Chem from rdkit.Chem import AllChem, rdFMCS import pandas as pd

  def matched_molecular_pairs(
      compounds: pd.DataFrame,
      smiles_col: str = 'smiles',
      activity_col: str = 'ic50'
  ) -> pd.DataFrame:
      """
      Find matched molecular pairs for SAR analysis.

      Identifies structural transformations that improve activity.
      """
      from mmpdb import fragment, index

      # Generate fragments and find pairs
      # ...
      pass

  def bioisosteric_replacement(
      smiles: str,
      replacement_type: str = 'carboxylic_acid'
  ) -> list:
      """
      Suggest bioisosteric replacements.

      Common bioisosteres:
      - COOH → tetrazole, sulfonamide
      - Phenyl → pyridine, thiophene
      - Ester → amide, reverse amide
      """
      bioisosteres = {
          'carboxylic_acid': [
              'C(=O)O >> c1nnn[nH]1',  # Tetrazole
              'C(=O)O >> S(=O)(=O)N',  # Sulfonamide
              'C(=O)O >> P(=O)(O)O',   # Phosphonate
          ],
          'phenyl': [
              'c1ccccc1 >> c1ccncc1',  # Pyridine
              'c1ccccc1 >> c1ccsc1',   # Thiophene
          ]
      }

      mol = Chem.MolFromSmiles(smiles)
      replacements = []

      for rxn_smarts in bioisosteres.get(replacement_type, []):
          rxn = AllChem.ReactionFromSmarts(rxn_smarts)
          products = rxn.RunReactants((mol,))
          for product in products:
              replacements.append(Chem.MolToSmiles(product[0]))

      return replacements

  def multiparameter_optimization(
      compounds: pd.DataFrame,
      objectives: dict,
      weights: dict = None
  ) -> pd.DataFrame:
      """
      Multi-parameter optimization (MPO).

      Balance potency, selectivity, ADMET properties.
      """
      # Calculate MPO score
      # Common: CNS MPO, Pfizer 3/75 rule

      if weights is None:
          weights = {obj: 1.0 for obj in objectives}

      def score_compound(row):
          total = 0
          for prop, (target, direction) in objectives.items():
              value = row[prop]
              if direction == 'minimize':
                  score = 1 - min(value / target, 1)
              else:  # maximize
                  score = min(value / target, 1)
              total += score * weights[prop]
          return total / sum(weights.values())

      compounds['mpo_score'] = compounds.apply(score_compound, axis=1)
      return compounds.sort_values('mpo_score', ascending=False)
why: "Systematic optimization improves success rate in drug development"

generative_design: name: AI-Driven Molecule Generation description: Generate novel molecules with desired properties pattern: | from typing import List, Callable from dataclasses import dataclass

  @dataclass
  class GenerationConfig:
      """Configuration for generative molecule design."""
      target_properties: dict  # Property: target value
      num_generations: int = 1000
      population_size: int = 100
      mutation_rate: float = 0.1

  def reinvent_generate(
      scoring_function: Callable,
      prior_model: str,
      num_molecules: int = 1000
  ) -> List[str]:
      """
      Generate molecules using REINVENT.

      Uses reinforcement learning to optimize for
      desired properties while maintaining drug-likeness.
      """
      # REINVENT is a popular generative model
      # https://github.com/MolecularAI/REINVENT4
      pass

  def generate_with_chembl(
      seed_molecule: str,
      target_activity: float,
      target_protein: str
  ) -> List[str]:
      """
      Generate analogs using ChEMBL data.

      Finds similar active compounds and combines features.
      """
      from chembl_webresource_client.new_client import new_client

      # Search for similar actives
      similarity = new_client.similarity
      similar_compounds = similarity.filter(
          smiles=seed_molecule,
          similarity=70
      )

      # Filter by activity against target
      # ...
      pass

  # De novo design with genetic algorithms
  def genetic_algorithm_design(
      config: GenerationConfig,
      fitness_function: Callable
  ) -> List[str]:
      """
      Genetic algorithm for molecule optimization.
      """
      from rdkit import Chem
      from rdkit.Chem import AllChem

      # SELFIES for robust molecule representation
      import selfies as sf

      # Initialize population
      # Mutate, crossover, select
      # ...
      pass
why: "Generative AI explores chemical space beyond known compounds"

anti_patterns:

ignoring_admet: name: Optimizing Potency Without ADMET problem: "Highly potent compound fails in vivo due to poor ADMET" solution: "Balance potency with ADMET from the start"

single_scoring_function: name: Trusting Single Docking Score problem: "Docking scores poorly correlate with binding affinity" solution: "Use consensus scoring, validate with orthogonal methods"

standard_docking_for_covalent: name: Using Non-Covalent Docking for Covalent Targets problem: "Standard docking cannot model covalent bond formation" solution: "Use CovDock, DOCKovalent, or GOLD covalent mode"

handoffs:

  • to: protein-structure when: "Need target structure for docking" pass: "Target protein sequence or PDB ID"

  • to: clinical-trial-analysis when: "Lead compound ready for clinical development" pass: "Compound profile, preclinical data"

  • to: statistical-analysis when: "Need SAR analysis or QSAR modeling" pass: "Activity data, compound structures"

ecosystem: docking: - "AutoDock Vina - Fast and accurate" - "GNINA - CNN-based scoring" - "Glide - Schrodinger commercial" - "GOLD - Genetic algorithm docking"

virtual_screening: - "OpenEye ROCS - Shape-based screening" - "Pharmit - Pharmacophore search" - "LigandScout - Pharmacophore modeling"

admet_prediction: - "ADMETlab 2.0 - Comprehensive prediction" - "SwissADME - Free web tool" - "pkCSM - Pharmacokinetics prediction" - "admetSAR - Toxicity prediction"

generative: - "REINVENT - RL-based generation" - "MolGPT - Transformer-based" - "ChemBERTa - Molecular BERT" - "GENTRL - Generative tensorial RL"

databases: - "ChEMBL - Bioactivity database" - "PubChem - Chemical compounds" - "ZINC - Commercially available compounds" - "Enamine REAL - Synthesizable compounds"