asi · bigquery
Install
Source · Clone the upstream repo:
```bash
git clone https://github.com/plurigrid/asi
```
Claude Code · Install into ~/.claude/skills/:
```bash
T=$(mktemp -d) \
  && git clone --depth=1 https://github.com/plurigrid/asi "$T" \
  && mkdir -p ~/.claude/skills \
  && cp -r "$T/plugins/asi/skills/bigquery" ~/.claude/skills/plurigrid-asi-bigquery \
  && rm -rf "$T"
```
Manifest: plugins/asi/skills/bigquery/SKILL.md
BigQuery Expert Engineer Skill
This skill provides a comprehensive guide for BigQuery development.
1. bq Command Line Tool Basics
1.1. Query Execution
```bash
# Execute a query with GoogleSQL (standard SQL)
bq query --use_legacy_sql=false 'SELECT * FROM `project.dataset.table` LIMIT 10'

# Output results in CSV format
bq query --use_legacy_sql=false --format=csv 'SELECT * FROM `project.dataset.table`'

# Dry run (cost estimation without executing the query)
bq query --use_legacy_sql=false --dry_run 'SELECT * FROM `project.dataset.table`'

# Save results to a destination table
bq query --use_legacy_sql=false --destination_table=project:dataset.result_table 'SELECT * FROM `project.dataset.table`'
```
1.2. Table Operations
```bash
# List tables in a dataset
bq ls project:dataset

# Show a table's schema
bq show --schema --format=prettyjson project:dataset.table

# Create a table from a schema file
bq mk --table project:dataset.table schema.json

# Create a time-partitioned table
bq mk --table --time_partitioning_field=created_at project:dataset.table schema.json

# Create a clustered table
bq mk --table --clustering_fields=user_id,category project:dataset.table schema.json

# Delete a table
bq rm -t project:dataset.table
```
1.3. Data Load/Export
```bash
# Load from CSV
bq load --source_format=CSV project:dataset.table gs://bucket/data.csv schema.json

# Load from newline-delimited JSON
bq load --source_format=NEWLINE_DELIMITED_JSON project:dataset.table gs://bucket/data.json

# Load from Parquet (schema auto-detected)
bq load --source_format=PARQUET --autodetect project:dataset.table gs://bucket/data.parquet

# Export to Cloud Storage
bq extract --destination_format=CSV project:dataset.table gs://bucket/export/*.csv
```
2. GoogleSQL Basic Syntax
2.1. SELECT Statement
```sql
-- Basic SELECT
SELECT
  column1,
  column2,
  COUNT(*) AS count
FROM `project.dataset.table`
WHERE date >= '2024-01-01'
GROUP BY column1, column2
HAVING COUNT(*) > 10
ORDER BY count DESC
LIMIT 100;
```
2.2. Common Functions
```sql
-- String functions
CONCAT(str1, str2)
LOWER(str), UPPER(str)
TRIM(str), LTRIM(str), RTRIM(str)
SUBSTR(str, start, length)
REGEXP_CONTAINS(str, r'pattern')
REGEXP_EXTRACT(str, r'pattern')
SPLIT(str, delimiter)

-- Date/time functions
CURRENT_DATE(), CURRENT_TIMESTAMP()
DATE(timestamp), TIMESTAMP(date)
DATE_ADD(date, INTERVAL 1 DAY)
DATE_DIFF(date1, date2, DAY)
FORMAT_DATE('%Y-%m-%d', date)
PARSE_DATE('%Y%m%d', str)
EXTRACT(YEAR FROM date)

-- Aggregate functions
COUNT(*), COUNT(DISTINCT column)
SUM(column), AVG(column)
MIN(column), MAX(column)
ARRAY_AGG(column)
STRING_AGG(column, ',')

-- Window functions
ROW_NUMBER() OVER (PARTITION BY col ORDER BY col2)
RANK() OVER (ORDER BY col DESC)
LAG(col, 1) OVER (ORDER BY date)
LEAD(col, 1) OVER (ORDER BY date)
SUM(col) OVER (PARTITION BY category)
```
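These compose naturally. A minimal sketch, assuming a hypothetical `users` table with `email` and `created_at` columns, that normalizes a string, buckets by date, aggregates, then layers a window function on top:

```sql
-- Daily signups per email domain, with a per-domain running total.
-- Table and column names are illustrative.
WITH daily AS (
  SELECT
    DATE(created_at) AS signup_date,
    LOWER(REGEXP_EXTRACT(email, r'@(.+)$')) AS domain,
    COUNT(*) AS signups
  FROM `project.dataset.users`
  GROUP BY signup_date, domain
)
SELECT
  signup_date,
  domain,
  signups,
  SUM(signups) OVER (PARTITION BY domain ORDER BY signup_date) AS running_total
FROM daily;
```

The CTE keeps the window function readable; see 2.4 for the WITH syntax.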
2.3. JOIN Syntax
```sql
-- INNER JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
INNER JOIN `project.dataset.table_b` AS b
  ON a.id = b.id;

-- LEFT JOIN
SELECT a.*, b.column
FROM `project.dataset.table_a` AS a
LEFT JOIN `project.dataset.table_b` AS b
  ON a.id = b.id;

-- CROSS JOIN (commonly used for array expansion)
SELECT *
FROM `project.dataset.table`,
  UNNEST(array_column) AS element;
```
2.4. CTE (Common Table Expressions)
```sql
WITH base_data AS (
  SELECT *
  FROM `project.dataset.table`
  WHERE date >= '2024-01-01'
),
aggregated AS (
  SELECT category, COUNT(*) AS count
  FROM base_data
  GROUP BY category
)
SELECT *
FROM aggregated
ORDER BY count DESC;
```
3. Table Design
3.1. Partitioning
Partitioning divides a table (typically by date) so queries scan only the relevant partitions, reducing bytes processed.
```sql
-- Create a date-partitioned table
CREATE TABLE `project.dataset.partitioned_table`
PARTITION BY DATE(created_at)
AS SELECT * FROM `project.dataset.source_table`;

-- Integer-range partitioning
CREATE TABLE `project.dataset.int_partitioned`
PARTITION BY RANGE_BUCKET(user_id, GENERATE_ARRAY(0, 1000000, 10000))
AS SELECT * FROM source;

-- Require a partition filter on every query
-- (a column list is needed here because there is no AS SELECT;
--  the columns shown are illustrative)
CREATE TABLE `project.dataset.table` (
  id INT64,
  created_at TIMESTAMP
)
PARTITION BY DATE(created_at)
OPTIONS (require_partition_filter = TRUE);
```
3.2. Clustering
Clustering sorts data within each partition by the specified columns, so filters and aggregations on those columns read fewer blocks.
```sql
-- Partitioned and clustered table
CREATE TABLE `project.dataset.clustered_table`
PARTITION BY DATE(created_at)
CLUSTER BY user_id, category
AS SELECT * FROM source;
```
3.3. Best Practices
- Combine partitioning and clustering (see the sketch after this list)
- Choose columns that queries filter on most often
- Use at most four clustering columns (the BigQuery limit)
- Prefer high-cardinality columns for clustering; low-cardinality date-like columns suit partitioning
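A minimal sketch combining these practices, assuming illustrative table and column names:

```sql
-- Date-partitioned, clustered, and requiring a partition filter.
-- Table and column names are illustrative.
CREATE TABLE `project.dataset.events`
PARTITION BY DATE(created_at)
CLUSTER BY user_id, category
OPTIONS (require_partition_filter = TRUE)
AS SELECT * FROM `project.dataset.source_table`;
```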
4. Performance Optimization
4.1. Query Optimization
```sql
-- Avoid SELECT *
-- Bad
SELECT * FROM table;
-- Good
SELECT column1, column2 FROM table;

-- Leverage partition pruning
-- Bad (a function applied to the partition column defeats pruning)
WHERE DATE(created_at) = '2024-01-01'
-- Good
WHERE created_at >= '2024-01-01' AND created_at < '2024-01-02'

-- Use APPROX_ functions when an estimate is acceptable (faster, cheaper)
SELECT APPROX_COUNT_DISTINCT(user_id) FROM table;
```
4.2. JOIN Optimization
```sql
-- Put the smaller table on the right side (enables a broadcast JOIN)
SELECT *
FROM large_table
JOIN small_table
  ON large_table.id = small_table.id;

-- JOIN only the columns you need
WITH filtered AS (
  SELECT id, needed_column
  FROM large_table
  WHERE condition
)
SELECT *
FROM filtered
JOIN other_table ON ...
```
4.3. Check Slot Usage
```sql
-- Check job statistics
SELECT
  job_id,
  total_bytes_processed,
  total_slot_ms,
  TIMESTAMP_DIFF(end_time, start_time, SECOND) AS duration_sec
FROM `region-us`.INFORMATION_SCHEMA.JOBS
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
ORDER BY total_slot_ms DESC
LIMIT 10;
```
5. Cost Management
5.1. Pricing Model
- On-demand: billed per bytes scanned (about $6.25/TiB in the US since the move to Editions pricing; check the pricing link in section 10) — cost-estimate sketch below
- Capacity (Editions): billed per slot-hour of reserved capacity; replaces the former flat-rate plans
- Storage: active $0.02/GB/month, long-term (table untouched for 90 days) $0.01/GB/month
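As a rough illustration of on-demand cost accounting, the following sketch prices recent jobs from INFORMATION_SCHEMA.JOBS; the $6.25/TiB constant is an assumption, so substitute your region's current rate:

```sql
-- Approximate per-job on-demand cost over the last 7 days.
-- ASSUMPTION: $6.25 per TiB scanned; adjust to your region's rate.
SELECT
  job_id,
  total_bytes_billed,
  total_bytes_billed / POW(1024, 4) * 6.25 AS est_cost_usd
FROM `region-us`.INFORMATION_SCHEMA.JOBS
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
  AND job_type = 'QUERY'
ORDER BY est_cost_usd DESC
LIMIT 10;
```

Note that on-demand billing is based on `total_bytes_billed`, not `total_bytes_processed`.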
5.2. Cost Reduction Best Practices
- Avoid SELECT *
- Always filter on the partition column
- Estimate cost with a dry run before running a query
- Serve repeated queries from materialized views (sketch below)
- Speed up dashboard queries with BI Engine
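A minimal materialized-view sketch, assuming an illustrative `events` table; BigQuery keeps the view incrementally refreshed and can transparently rewrite matching queries against the base table to read from it:

```sql
-- Precompute a daily aggregate that dashboards hit repeatedly.
-- Table and column names are illustrative.
CREATE MATERIALIZED VIEW `project.dataset.daily_counts` AS
SELECT
  DATE(created_at) AS day,
  category,
  COUNT(*) AS event_count
FROM `project.dataset.events`
GROUP BY DATE(created_at), category;
```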
5.3. Custom Quota Settings
Set daily query byte limits per project or per user (the "Query usage per day" custom quotas) from the Cloud Console quotas page or with gcloud; queries that would exceed the cap fail instead of accruing cost.
6. Data Governance
6.1. IAM Roles
- roles/bigquery.admin: full control over all BigQuery resources
- roles/bigquery.dataEditor: read and write table data
- roles/bigquery.dataViewer: read-only access to data
- roles/bigquery.jobUser: execute jobs, including queries
- roles/bigquery.user: execute jobs and list datasets
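Roles are typically granted through IAM, but GoogleSQL also offers DCL statements for dataset- and table-level grants. A minimal sketch, assuming a placeholder principal and dataset:

```sql
-- Grant read-only access on a single dataset (schema) to one user.
-- Principal and dataset names are placeholders.
GRANT `roles/bigquery.dataViewer`
ON SCHEMA `project.dataset`
TO "user:analyst@example.com";
```

A GRANT at this scope mirrors a dataset-level IAM binding for the same role.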
6.2. Column-level Security
```sql
-- Apply a policy tag to a column
ALTER TABLE `project.dataset.table`
ALTER COLUMN sensitive_column
SET OPTIONS (
  policy_tags = ['projects/project/locations/us/taxonomies/123/policyTags/456']
);
```
6.3. Row-level Security
```sql
-- Create a row access policy
CREATE ROW ACCESS POLICY region_filter
ON `project.dataset.table`
GRANT TO ('user:analyst@example.com')
FILTER USING (region = 'APAC');
```
7. BigQuery ML
7.1. Model Creation
```sql
-- Linear regression model
CREATE OR REPLACE MODEL `project.dataset.model`
OPTIONS (
  model_type = 'LINEAR_REG',
  input_label_cols = ['target']
) AS
SELECT feature1, feature2, target
FROM `project.dataset.training_data`;

-- Logistic regression model
CREATE OR REPLACE MODEL `project.dataset.classifier`
OPTIONS (
  model_type = 'LOGISTIC_REG',
  input_label_cols = ['label']
) AS
SELECT * FROM training_data;
```
7.2. Model Evaluation and Prediction
```sql
-- Model evaluation
SELECT * FROM ML.EVALUATE(MODEL `project.dataset.model`);

-- Prediction
SELECT *
FROM ML.PREDICT(
  MODEL `project.dataset.model`,
  (SELECT * FROM `project.dataset.new_data`)
);
```
8. External Data Sources
8.1. External Tables
```sql
-- Reference Cloud Storage CSV files as an external table
CREATE EXTERNAL TABLE `project.dataset.external_table`
OPTIONS (
  format = 'CSV',
  uris = ['gs://bucket/path/*.csv'],
  skip_leading_rows = 1
);

-- Parquet external table
CREATE EXTERNAL TABLE `project.dataset.parquet_table`
OPTIONS (
  format = 'PARQUET',
  uris = ['gs://bucket/path/*.parquet']
);
```
8.2. Federated Query
```sql
-- Connect to Cloud SQL
SELECT *
FROM EXTERNAL_QUERY(
  'projects/project/locations/us/connections/connection_id',
  'SELECT * FROM mysql_table'
);
```
9. Scheduled Queries
9.1. Configuration Example
```bash
# Configure in the Cloud Console or with the bq command.
# 'every 24 hours' runs once a day; use a form like 'every day 02:00'
# to pin a specific time.
bq query --use_legacy_sql=false \
  --schedule='every 24 hours' \
  --display_name='Daily aggregation' \
  --destination_table='project:dataset.daily_summary' \
  --replace \
  'SELECT DATE(created_at) AS date, COUNT(*) AS count FROM source GROUP BY 1'
```
10. Reference Links
- Official docs: https://cloud.google.com/bigquery/docs
- GoogleSQL reference: https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax
- Function reference: https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-all
- bq command reference: https://cloud.google.com/bigquery/docs/bq-command-line-tool
- BigQuery ML: https://cloud.google.com/bigquery/docs/bqml-introduction
- Pricing: https://cloud.google.com/bigquery/pricing