claude-code-plugins · snowflake-reliability-patterns
Install
Source · Clone the upstream repo
git clone https://github.com/jeremylongshore/claude-code-plugins-plus-skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/jeremylongshore/claude-code-plugins-plus-skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/plugins/saas-packs/snowflake-pack/skills/snowflake-reliability-patterns" ~/.claude/skills/jeremylongshore-claude-code-plugins-snowflake-reliability-patterns && rm -rf "$T"
Manifest · plugins/saas-packs/snowflake-pack/skills/snowflake-reliability-patterns/SKILL.md
Source content
Snowflake Reliability Patterns
Overview
Production-grade reliability patterns for Snowflake: database replication, account failover, Time Travel recovery, and application-level circuit breakers.
Instructions
Step 1: Time Travel for Point-in-Time Recovery
```sql
-- Query historical data (up to 90 days on Enterprise Edition)
SELECT * FROM orders AT (TIMESTAMP => '2026-03-21 14:00:00'::TIMESTAMP_NTZ);

-- Restore a table to a previous state
CREATE OR REPLACE TABLE orders CLONE orders
  AT (TIMESTAMP => '2026-03-21 14:00:00'::TIMESTAMP_NTZ);

-- Restore a dropped table
UNDROP TABLE orders;
UNDROP SCHEMA my_schema;
UNDROP DATABASE my_database;

-- Query by offset (5 minutes ago)
SELECT * FROM orders AT (OFFSET => -300);

-- Query by statement ID (before a specific query ran)
SELECT * FROM orders BEFORE (STATEMENT => '<query_id_of_bad_update>');

-- Set retention period per table
ALTER TABLE critical_data SET DATA_RETENTION_TIME_IN_DAYS = 90;
ALTER TABLE temp_staging SET DATA_RETENTION_TIME_IN_DAYS = 0; -- No Time Travel
```
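The same point-in-time recovery can be scripted from an application. Below is a minimal sketch, assuming an already-connected snowflake-sdk connection; the `recoverTableAt` helper name is illustrative, and the table and timestamp values simply mirror the example above. Identifiers are interpolated into the SQL, so only trusted values should be passed in.

```typescript
// recover-table.ts -- illustrative sketch, not part of the pack's shipped code
import snowflake from 'snowflake-sdk';

// Recreate a table from its state at a given point in time using a zero-copy clone.
// `conn` is assumed to be an already-connected snowflake-sdk Connection.
// Note: table names cannot be bound as parameters, so pass trusted identifiers only.
function recoverTableAt(
  conn: snowflake.Connection,
  table: string,
  timestamp: string,
): Promise<void> {
  const sqlText =
    `CREATE OR REPLACE TABLE ${table} CLONE ${table} ` +
    `AT (TIMESTAMP => '${timestamp}'::TIMESTAMP_NTZ)`;
  return new Promise((resolve, reject) => {
    conn.execute({
      sqlText,
      complete: (err) => (err ? reject(err) : resolve()),
    });
  });
}

// Example: roll orders back to just before a bad update.
// recoverTableAt(conn, 'orders', '2026-03-21 14:00:00').catch(console.error);
```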
Step 2: Database Replication Across Regions
```sql
-- Enable replication on source account (primary)
ALTER DATABASE PROD_DW ENABLE REPLICATION
  TO ACCOUNTS myorg.us_east_account, myorg.eu_west_account;

-- On target account: create replica database
CREATE DATABASE PROD_DW_REPLICA AS REPLICA OF myorg.us_west_account.PROD_DW;

-- Refresh replica (manual or scheduled)
ALTER DATABASE PROD_DW_REPLICA REFRESH;

-- Check replication status
SELECT *
FROM TABLE(INFORMATION_SCHEMA.DATABASE_REPLICATION_USAGE_HISTORY(
  DATE_RANGE_START => DATEADD(hours, -24, CURRENT_TIMESTAMP())
));

-- Check replication lag
SELECT
  database_name,
  primary_snowflake_region,
  replication_allowed,
  is_primary,
  DATEDIFF('minute', snowflake_region_last_refresh_time, CURRENT_TIMESTAMP()) AS lag_minutes
FROM TABLE(INFORMATION_SCHEMA.REPLICATION_DATABASES())
WHERE database_name = 'PROD_DW_REPLICA';
```
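A lag check like the one above can be wrapped in a small application-side monitor. A minimal sketch, assuming an existing snowflake-sdk connection and reusing the same lag query as above; the 15-minute alert threshold is an arbitrary example, not a recommendation from the pack.

```typescript
// replication-lag-check.ts -- illustrative monitoring sketch
import snowflake from 'snowflake-sdk';

// Same lag query as in the SQL example above.
const LAG_QUERY = `
  SELECT DATEDIFF('minute', snowflake_region_last_refresh_time, CURRENT_TIMESTAMP()) AS lag_minutes
  FROM TABLE(INFORMATION_SCHEMA.REPLICATION_DATABASES())
  WHERE database_name = 'PROD_DW_REPLICA'`;

// Returns the replica's lag in minutes, or rejects if the query fails.
function getReplicationLagMinutes(conn: snowflake.Connection): Promise<number> {
  return new Promise((resolve, reject) => {
    conn.execute({
      sqlText: LAG_QUERY,
      complete: (err, _stmt, rows) =>
        err ? reject(err) : resolve(rows?.[0]?.LAG_MINUTES ?? Number.NaN),
    });
  });
}

// Example: alert when lag exceeds 15 minutes (threshold is illustrative).
// const lag = await getReplicationLagMinutes(conn);
// if (lag > 15) console.warn(`Replication lag is ${lag} minutes`);
```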
Step 3: Account Failover Groups
```sql
-- Create failover group (replicates databases, warehouses, roles, etc.)
-- On primary account:
CREATE FAILOVER GROUP prod_failover
  OBJECT_TYPES = DATABASES, WAREHOUSES, ROLES, USERS, INTEGRATIONS
  ALLOWED_DATABASES = PROD_DW
  ALLOWED_ACCOUNTS = myorg.us_east_account
  REPLICATION_SCHEDULE = '10 MINUTE';

-- On secondary account: create as replica
CREATE FAILOVER GROUP prod_failover
  AS REPLICA OF myorg.us_west_account.prod_failover;

-- Promote secondary to primary (during outage)
ALTER FAILOVER GROUP prod_failover PRIMARY;

-- After recovery, switch back
-- On original primary:
ALTER FAILOVER GROUP prod_failover PRIMARY;
```
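Promotion can also be triggered from a runbook script rather than a worksheet. A minimal sketch, assuming a connection to the secondary account; the function name is illustrative and the failover group name matches the example above.

```typescript
// promote-failover-group.ts -- illustrative runbook sketch
import snowflake from 'snowflake-sdk';

// Promote a secondary failover group to primary during a regional outage.
// `conn` must be connected to the SECONDARY account, not the failed primary.
function promoteToPrimary(conn: snowflake.Connection, groupName: string): Promise<void> {
  return new Promise((resolve, reject) => {
    conn.execute({
      sqlText: `ALTER FAILOVER GROUP ${groupName} PRIMARY`,
      complete: (err) => (err ? reject(err) : resolve()),
    });
  });
}

// Example:
// await promoteToPrimary(secondaryConn, 'prod_failover');
```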
Step 4: Application-Level Connection Failover
```typescript
// src/snowflake/resilient-connection.ts
import snowflake from 'snowflake-sdk';

interface FailoverConfig {
  primary: snowflake.ConnectionOptions;
  secondary: snowflake.ConnectionOptions;
  healthCheckIntervalMs: number;
}

class ResilientSnowflakeConnection {
  private activeConn: snowflake.Connection | null = null;
  private isPrimary = true;
  private config: FailoverConfig;

  constructor(config: FailoverConfig) {
    this.config = config;
  }

  async connect(): Promise<snowflake.Connection> {
    try {
      this.activeConn = await this.tryConnect(this.config.primary);
      this.isPrimary = true;
      return this.activeConn;
    } catch (primaryErr) {
      console.warn('Primary Snowflake connection failed, trying secondary');
      try {
        this.activeConn = await this.tryConnect(this.config.secondary);
        this.isPrimary = false;
        return this.activeConn;
      } catch (secondaryErr) {
        throw new Error(
          `Both Snowflake connections failed. Primary: ${primaryErr.message} Secondary: ${secondaryErr.message}`
        );
      }
    }
  }

  private tryConnect(opts: snowflake.ConnectionOptions): Promise<snowflake.Connection> {
    return new Promise((resolve, reject) => {
      const conn = snowflake.createConnection(opts);
      conn.connect((err, conn) => (err ? reject(err) : resolve(conn)));
    });
  }

  async healthCheck(): Promise<{ connected: boolean; isPrimary: boolean }> {
    try {
      await new Promise<void>((resolve, reject) => {
        this.activeConn!.execute({
          sqlText: 'SELECT 1',
          complete: (err) => (err ? reject(err) : resolve()),
        });
      });
      return { connected: true, isPrimary: this.isPrimary };
    } catch {
      return { connected: false, isPrimary: this.isPrimary };
    }
  }
}

// Usage
const resilientConn = new ResilientSnowflakeConnection({
  primary: {
    account: 'myorg-primary',
    username: process.env.SNOWFLAKE_USER!,
    password: process.env.SNOWFLAKE_PASSWORD!,
    warehouse: 'PROD_WH',
    database: 'PROD_DW',
  },
  secondary: {
    account: 'myorg-secondary',
    username: process.env.SNOWFLAKE_USER!,
    password: process.env.SNOWFLAKE_PASSWORD!,
    warehouse: 'PROD_WH',
    database: 'PROD_DW_REPLICA',
  },
  healthCheckIntervalMs: 30000,
});
```
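The overview also mentions application-level circuit breakers, which the connection class above does not cover: it fails over at connect time but does not stop hammering a degraded warehouse with queries. Below is a minimal circuit-breaker sketch; the class name, thresholds, and the `executeQuery` helper referenced in the usage comment are illustrative assumptions, not part of the pack.

```typescript
// src/snowflake/circuit-breaker.ts -- illustrative sketch; thresholds are arbitrary examples
type BreakerState = 'CLOSED' | 'OPEN' | 'HALF_OPEN';

class SnowflakeCircuitBreaker {
  private state: BreakerState = 'CLOSED';
  private failures = 0;
  private openedAt = 0;

  constructor(
    private failureThreshold = 5,    // consecutive failures before opening
    private resetTimeoutMs = 60_000, // how long to stay open before probing
  ) {}

  async run<T>(query: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (Date.now() - this.openedAt < this.resetTimeoutMs) {
        throw new Error('Circuit open: Snowflake queries are being short-circuited');
      }
      this.state = 'HALF_OPEN'; // allow one probe query through
    }
    try {
      const result = await query();
      this.failures = 0;
      this.state = 'CLOSED';
      return result;
    } catch (err) {
      this.failures += 1;
      if (this.state === 'HALF_OPEN' || this.failures >= this.failureThreshold) {
        this.state = 'OPEN';
        this.openedAt = Date.now();
      }
      throw err;
    }
  }
}

// Example: wrap query execution so a struggling warehouse fails fast instead of piling up retries.
// const breaker = new SnowflakeCircuitBreaker();
// await breaker.run(() => executeQuery(resilientConn, 'SELECT * FROM orders LIMIT 10'));
```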
Step 5: Pipeline Retry and Idempotency
```sql
-- Idempotent data loading (safe to retry)
MERGE INTO silver.orders AS target
USING (
  SELECT * FROM bronze.raw_orders
  WHERE ingestion_time >= DATEADD(hours, -1, CURRENT_TIMESTAMP())
) AS source
ON target.order_id = source.order_id
WHEN NOT MATCHED THEN
  INSERT (order_id, customer_id, amount, order_date)
  VALUES (source.order_id, source.customer_id, source.amount, source.order_date);

-- MERGE is idempotent: running it twice with the same data produces the same result
-- Prefer MERGE over INSERT for retry-safe pipelines

-- Task retry configuration
CREATE OR REPLACE TASK reliable_transform
  WAREHOUSE = ETL_WH
  SCHEDULE = '5 MINUTE'
  ALLOW_OVERLAPPING_EXECUTION = FALSE  -- Prevent concurrent runs
  SUSPEND_TASK_AFTER_NUM_FAILURES = 3  -- Auto-suspend after 3 failures
WHEN SYSTEM$STREAM_HAS_DATA('orders_stream')
AS
  MERGE INTO dim_orders ...;

ALTER TASK reliable_transform RESUME;
```
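Because the MERGE is idempotent, the application side can retry it safely. A minimal retry sketch with exponential backoff, assuming an `executeQuery(conn, sql)` helper and a `mergeOrdersSql` string like the statement above; the attempt count and delays are arbitrary examples.

```typescript
// retry-merge.ts -- illustrative sketch; retry counts and delays are arbitrary
// Retries an idempotent statement (e.g. the MERGE above) with exponential backoff.
async function retryIdempotent<T>(
  run: () => Promise<T>,
  maxAttempts = 3,
  baseDelayMs = 1_000,
): Promise<T> {
  let lastError: unknown;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await run();
    } catch (err) {
      lastError = err;
      if (attempt < maxAttempts) {
        // Exponential backoff: 1s, 2s, 4s, ...
        await new Promise((r) => setTimeout(r, baseDelayMs * 2 ** (attempt - 1)));
      }
    }
  }
  throw lastError;
}

// Example: safe to retry because MERGE produces the same result on every run.
// await retryIdempotent(() => executeQuery(conn, mergeOrdersSql));
```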
Step 6: Backup Strategy
```sql
-- Zero-copy clone backups (instant, no extra storage until data changes)
CREATE DATABASE PROD_DW_BACKUP_20260322 CLONE PROD_DW;

-- Automated daily backup via task
CREATE OR REPLACE TASK daily_backup
  WAREHOUSE = ADMIN_WH
  SCHEDULE = 'USING CRON 0 3 * * * UTC'
AS
BEGIN
  LET backup_name VARCHAR := 'PROD_DW_BACKUP_' || TO_CHAR(CURRENT_DATE(), 'YYYYMMDD');
  EXECUTE IMMEDIATE 'CREATE DATABASE IF NOT EXISTS ' || :backup_name || ' CLONE PROD_DW';
  -- Clean up backups older than 7 days
  -- (done via separate cleanup task or stored procedure)
END;
```
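The cleanup step left as a comment above could also be scripted externally. A minimal sketch, assuming the PROD_DW_BACKUP_YYYYMMDD naming convention from the task above; the 7-day window mirrors that comment, and the function name is illustrative.

```typescript
// cleanup-backups.ts -- illustrative sketch for the cleanup step noted above
import snowflake from 'snowflake-sdk';

const RETENTION_DAYS = 7; // mirrors the 7-day window in the task comment

// Drop clone databases named PROD_DW_BACKUP_YYYYMMDD that are older than the retention window.
async function cleanupOldBackups(conn: snowflake.Connection): Promise<void> {
  const databases = await new Promise<any[]>((resolve, reject) => {
    conn.execute({
      sqlText: "SHOW DATABASES LIKE 'PROD_DW_BACKUP_%'",
      complete: (err, _stmt, rows) => (err ? reject(err) : resolve(rows ?? [])),
    });
  });

  const cutoff = new Date(Date.now() - RETENTION_DAYS * 24 * 60 * 60 * 1000);
  for (const db of databases) {
    const name: string = db.name; // SHOW DATABASES returns a "name" column
    const stamp = name.replace('PROD_DW_BACKUP_', ''); // YYYYMMDD suffix
    const created = new Date(
      `${stamp.slice(0, 4)}-${stamp.slice(4, 6)}-${stamp.slice(6, 8)}`
    );
    if (created < cutoff) {
      await new Promise<void>((resolve, reject) => {
        conn.execute({
          sqlText: `DROP DATABASE IF EXISTS ${name}`,
          complete: (err) => (err ? reject(err) : resolve()),
        });
      });
    }
  }
}
```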
Recovery Time Objectives
| Scenario | Recovery Method | RTO | RPO |
|---|---|---|---|
| Accidental table drop | UNDROP | Instant | Zero |
| Bad data update | Time Travel CLONE | Minutes | Configurable (up to 90 days) |
| Regional outage | Failover group | 10-30 min | Replication lag |
| Account compromise | Contact Snowflake support | Hours | Last backup |
Error Handling
| Issue | Cause | Solution |
|---|---|---|
| Time Travel expired | Past retention period | Increase DATA_RETENTION_TIME_IN_DAYS |
| Replication lag high | Large data changes | Check replication schedule, network |
| Failover fails | Secondary not synced | Verify failover group status |
| UNDROP fails | Object recreated with same name | Rename current object first |
Next Steps
For policy enforcement, see snowflake-policy-guardrails.