Vibeship-spawner-skills fpga-design

FPGA Design Skill

install

source · Clone the upstream repo

git clone https://github.com/vibeforge1111/vibeship-spawner-skills

manifest: hardware/fpga-design/skill.yaml

FPGA Design Skill

Hardware description and digital design

id: fpga-design name: FPGA Design category: hardware complexity: expert requires_skills:

embedded-systems

description: | Patterns for FPGA development including RTL design (Verilog/VHDL), timing closure, clock domain crossing, high-level synthesis, and verification. Covers both traditional HDL and modern HLS approaches.

patterns:

synchronizer_cdc: name: Clock Domain Crossing Synchronizer description: Two-flip-flop synchronizer for single-bit CDC critical: true pattern: | // Two-Flip-Flop Synchronizer for single-bit signals // Raises the metastability MTBF (mean time between failures) to acceptable levels

  // sync_2ff: shift-register synchronizer that brings one asynchronous bit
  // into the clk_dst domain.
  //
  // Parameters:
  //   STAGES   - number of flip-flops in the chain; expected to be >= 2
  //              (use 3 for high-speed destination clocks).
  // Ports:
  //   clk_dst  - destination-domain clock; every stage is clocked by it.
  //   rst_n    - active-low asynchronous reset; clears all stages to 0.
  //   async_in - bit coming from another (or no) clock domain.
  //   sync_out - output of the last stage, STAGES clk_dst edges behind async_in.
  module sync_2ff #(
      parameter STAGES = 2
  )(
      input  wire clk_dst,
      input  wire rst_n,
      input  wire async_in,
      output wire sync_out
  );

      // Xilinx attribute: asks the tool to place the chain flip-flops close together.
      (* ASYNC_REG = "TRUE" *)
      reg [STAGES-1:0] stage_q;

      integer k;

      always @(posedge clk_dst or negedge rst_n) begin
          if (!rst_n) begin
              stage_q <= {STAGES{1'b0}};
          end else begin
              // Stage 0 samples the asynchronous input; every later stage
              // copies the previous stage's value from before this edge.
              stage_q[0] <= async_in;
              for (k = 1; k < STAGES; k = k + 1)
                  stage_q[k] <= stage_q[k-1];
          end
      end

      // The last flip-flop in the chain drives the synchronized output.
      assign sync_out = stage_q[STAGES-1];

  endmodule

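  // Usage: transfer a single-cycle pulse between two clock domains (toggle synchronizer).
  // Source pulses must be spaced far enough apart (several clk_dst cycles) for the
  // destination domain to sample each toggle; closer pulses can be merged or lost.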
  // pulse_sync: carries a pulse from the clk_src domain to the clk_dst domain.
  //
  // Each cycle in which pulse_in is high flips a level in the source domain.
  // That level is passed through a sync_2ff instance, and a change of the
  // synchronized level is reported as pulse_out.
  //
  // Ports:
  //   clk_src   - source-domain clock.
  //   clk_dst   - destination-domain clock.
  //   rst_n     - active-low asynchronous reset, used by the registers of both
  //               domains. NOTE(review): a single reset feeds two clock
  //               domains; confirm its release is safe for both.
  //   pulse_in  - pulse in the source domain.
  //   pulse_out - high while the synchronized level differs from its
  //               previous clk_dst sample.
  module pulse_sync (
      input  wire clk_src,
      input  wire clk_dst,
      input  wire rst_n,
      input  wire pulse_in,
      output wire pulse_out
  );

      reg  tgl_src;     // level that flips on every source pulse
      wire tgl_dst;     // tgl_src after the synchronizer
      reg  tgl_dst_q;   // tgl_dst delayed by one clk_dst cycle

      // Source domain: pulse -> level change.
      always @(posedge clk_src or negedge rst_n) begin
          if (!rst_n) begin
              tgl_src <= 1'b0;
          end else begin
              if (pulse_in)
                  tgl_src <= ~tgl_src;
          end
      end

      // Level crosses into the destination domain.
      sync_2ff u_tgl_sync (
          .clk_dst  (clk_dst),
          .rst_n    (rst_n),
          .async_in (tgl_src),
          .sync_out (tgl_dst)
      );

      // Destination domain: remember the previous synchronized level.
      always @(posedge clk_dst or negedge rst_n) begin
          if (!rst_n) begin
              tgl_dst_q <= 1'b0;
          end else begin
              tgl_dst_q <= tgl_dst;
          end
      end

      // Any change of level (rising or falling) yields a pulse.
      assign pulse_out = tgl_dst_q ^ tgl_dst;

  endmodule
why: "CDC without proper synchronization causes random failures (metastability)"

async_fifo: name: Asynchronous FIFO description: Multi-bit data transfer between clock domains critical: true pattern: | // Asynchronous FIFO for multi-bit CDC // Uses Gray code pointers to prevent metastability corruption

  // async_fifo: dual-clock FIFO for moving multi-bit data between clock domains.
  //
  // Read and write pointers are one bit wider than the memory address. Each
  // pointer is kept in binary (for addressing) and in Gray code (for crossing
  // into the other domain through a sync_2ff per bit).
  //
  // Parameters:
  //   DATA_WIDTH - width of each stored word.
  //   ADDR_WIDTH - address width; depth is 2^ADDR_WIDTH (ADDR_WIDTH >= 1).
  //
  // Write port (wr_clk domain):
  //   wr_rst_n - active-low asynchronous reset of the write-side registers.
  //   wr_en    - write request; ignored while full is high.
  //   wr_data  - word to store.
  //   full     - high when the write pointer is a full lap ahead of the
  //              synchronized read pointer.
  // Read port (rd_clk domain):
  //   rd_rst_n - active-low asynchronous reset of the read-side registers.
  //   rd_en    - read request; ignored while empty is high.
  //   rd_data  - combinational read of the word at the current read pointer.
  //   empty    - high when the read pointer equals the synchronized write pointer.
  //
  // Both flags are derived from pointers that have passed through a 2-stage
  // synchronizer, so they observe the other side's progress with that delay.
  module async_fifo #(
      parameter DATA_WIDTH = 8,
      parameter ADDR_WIDTH = 4  // Depth = 2^ADDR_WIDTH
  )(
      // Write port (source clock domain)
      input  wire                  wr_clk,
      input  wire                  wr_rst_n,
      input  wire                  wr_en,
      input  wire [DATA_WIDTH-1:0] wr_data,
      output wire                  full,

      // Read port (destination clock domain)
      input  wire                  rd_clk,
      input  wire                  rd_rst_n,
      input  wire                  rd_en,
      output wire [DATA_WIDTH-1:0] rd_data,
      output wire                  empty
  );

      localparam DEPTH = 1 << ADDR_WIDTH;

      // XOR mask that inverts the two most significant bits of a Gray pointer.
      // Expressed as a mask (not a part-select) so ADDR_WIDTH = 1 stays legal.
      localparam [ADDR_WIDTH:0] FULL_FLIP = 3 << (ADDR_WIDTH - 1);

      // Memory
      reg [DATA_WIDTH-1:0] mem [0:DEPTH-1];

      // Pointers (binary and Gray code), one extra bit for lap detection
      reg [ADDR_WIDTH:0] wr_ptr_bin, wr_ptr_gray;
      reg [ADDR_WIDTH:0] rd_ptr_bin, rd_ptr_gray;

      // Pointers after crossing into the opposite domain
      wire [ADDR_WIDTH:0] wr_ptr_gray_sync;
      wire [ADDR_WIDTH:0] rd_ptr_gray_sync;

      // Binary to Gray conversion
      function [ADDR_WIDTH:0] bin2gray(input [ADDR_WIDTH:0] bin);
          bin2gray = bin ^ (bin >> 1);
      endfunction

      // A transfer happens only when requested and permitted by the flag.
      wire wr_fire = wr_en && !full;
      wire rd_fire = rd_en && !empty;

      // Memory write: kept in its own clock-only block (no asynchronous reset
      // in the sensitivity list) so the array matches RAM inference templates.
      // Gating with wr_rst_n keeps the "no write while in reset" behaviour.
      always @(posedge wr_clk) begin
          if (wr_rst_n && wr_fire)
              mem[wr_ptr_bin[ADDR_WIDTH-1:0]] <= wr_data;
      end

      // Write pointer update
      always @(posedge wr_clk or negedge wr_rst_n) begin
          if (!wr_rst_n) begin
              wr_ptr_bin  <= 0;
              wr_ptr_gray <= 0;
          end else if (wr_fire) begin
              wr_ptr_bin  <= wr_ptr_bin + 1;
              wr_ptr_gray <= bin2gray(wr_ptr_bin + 1);
          end
      end

      // Read pointer update
      always @(posedge rd_clk or negedge rd_rst_n) begin
          if (!rd_rst_n) begin
              rd_ptr_bin  <= 0;
              rd_ptr_gray <= 0;
          end else if (rd_fire) begin
              rd_ptr_bin  <= rd_ptr_bin + 1;
              rd_ptr_gray <= bin2gray(rd_ptr_bin + 1);
          end
      end

      // Asynchronous (combinational) read: the head word is visible without
      // a read clock edge.
      assign rd_data = mem[rd_ptr_bin[ADDR_WIDTH-1:0]];

      // Synchronize write pointer to read domain (one sync_2ff per bit)
      sync_2ff #(.STAGES(2)) sync_wr [ADDR_WIDTH:0] (
          .clk_dst(rd_clk),
          .rst_n(rd_rst_n),
          .async_in(wr_ptr_gray),
          .sync_out(wr_ptr_gray_sync)
      );

      // Synchronize read pointer to write domain (one sync_2ff per bit)
      sync_2ff #(.STAGES(2)) sync_rd [ADDR_WIDTH:0] (
          .clk_dst(wr_clk),
          .rst_n(wr_rst_n),
          .async_in(rd_ptr_gray),
          .sync_out(rd_ptr_gray_sync)
      );

      // Full: the write pointer is exactly DEPTH entries ahead of the read
      // pointer. In Gray code that is "the TWO upper bits inverted, all
      // remaining bits equal".
      assign full = (wr_ptr_gray == (rd_ptr_gray_sync ^ FULL_FLIP));

      // Empty: pointers are equal
      assign empty = (rd_ptr_gray == wr_ptr_gray_sync);

  endmodule
why: "Multi-bit CDC requires FIFO with Gray code pointers for safe transfer"

fsm_design: name: Finite State Machine Design description: Safe and synthesizable FSM patterns pattern: | // One-Hot FSM with Safe State Encoding // Preferred for FPGA (uses flip-flops efficiently)

  // fsm_onehot: four-state one-hot FSM  IDLE -> START -> PROCESS -> DONE -> IDLE.
  //
  // Parameters:
  //   IDLE, START, PROCESS, DONE - 4-bit state encodings (one-hot by default).
  //     The next-state logic compares against these parameters, so overriding
  //     them keeps the machine consistent as long as they remain distinct.
  // Ports:
  //   clk          - clock.
  //   rst_n        - active-low asynchronous reset; state returns to IDLE.
  //   start        - leaves IDLE when high.
  //   data_valid   - leaves START when high.
  //   complete     - leaves PROCESS when high.
  //   busy         - registered; high in every cycle where state != IDLE
  //                  (it is loaded from next_state, so it tracks state).
  //   result_valid - registered from the CURRENT state, so it is high for the
  //                  cycle AFTER the machine was in DONE.
  //
  // Any encoding that is not one of the four parameters (all-zero or multi-hot,
  // e.g. after an upset) is sent back to IDLE on the next clock edge.
  module fsm_onehot #(
      parameter IDLE     = 4'b0001,
      parameter START    = 4'b0010,
      parameter PROCESS  = 4'b0100,
      parameter DONE     = 4'b1000
  )(
      input  wire clk,
      input  wire rst_n,
      input  wire start,
      input  wire data_valid,
      input  wire complete,
      output reg  busy,
      output reg  result_valid
  );

      // Xilinx synthesis directives: request one-hot encoding and ask the tool
      // to keep the default-branch recovery instead of optimizing it away.
      // NOTE(review): fsm_safe_state is Vivado-specific; other tools ignore it.
      (* fsm_encoding = "one_hot", fsm_safe_state = "default_state" *)
      reg [3:0] state, next_state;

      // State register (sequential)
      always @(posedge clk or negedge rst_n) begin
          if (!rst_n)
              state <= IDLE;
          else
              state <= next_state;
      end

      // Next state logic (combinational)
      always @(*) begin
          // Default: stay in current state (also prevents latch inference)
          next_state = state;

          // Full compare against the state parameters: only the four legal
          // encodings match, everything else falls through to the default.
          case (state)
              IDLE: begin
                  if (start)
                      next_state = START;
              end

              START: begin
                  if (data_valid)
                      next_state = PROCESS;
              end

              PROCESS: begin
                  if (complete)
                      next_state = DONE;
              end

              DONE: begin
                  next_state = IDLE;
              end

              default: begin  // Safety: recover from any invalid encoding
                  next_state = IDLE;
              end
          endcase
      end

      // Output logic (registered for better timing)
      always @(posedge clk or negedge rst_n) begin
          if (!rst_n) begin
              busy <= 1'b0;
              result_valid <= 1'b0;
          end else begin
              busy <= (next_state != IDLE);     // aligned with state
              result_valid <= (state == DONE);  // one cycle after DONE
          end
      end

  endmodule

  // Binary FSM (for resource-constrained designs)
  // fsm_binary: four-state FSM with a 2-bit binary encoding, exposed on `state`.
  //
  //   IDLE --start--> ACTIVE --> WAIT --> DONE --> IDLE
  //
  // Ports:
  //   clk   - clock.
  //   rst_n - active-low asynchronous reset; state returns to IDLE.
  //   start - leaves IDLE when high; ignored in every other state.
  //   state - current state code (IDLE=00, ACTIVE=01, WAIT=10, DONE=11).
  module fsm_binary (
      input  wire clk,
      input  wire rst_n,
      input  wire start,
      output reg [1:0] state
  );

      localparam [1:0] IDLE   = 2'b00;
      localparam [1:0] ACTIVE = 2'b01;
      localparam [1:0] WAIT   = 2'b10;
      localparam [1:0] DONE   = 2'b11;

      reg [1:0] state_nxt;

      // Combinational successor of the current state. Every branch assigns
      // state_nxt, so no latch is inferred.
      always @(*) begin
          case (state)
              ACTIVE: begin
                  state_nxt = WAIT;
              end
              WAIT: begin
                  state_nxt = DONE;
              end
              DONE: begin
                  state_nxt = IDLE;
              end
              IDLE: begin
                  state_nxt = IDLE;
                  if (start)
                      state_nxt = ACTIVE;
              end
              default: begin
                  // Reached only for unknown (X/Z) codes in simulation.
                  state_nxt = IDLE;
              end
          endcase
      end

      // State register.
      always @(posedge clk or negedge rst_n) begin
          if (!rst_n) begin
              state <= IDLE;
          end else begin
              state <= state_nxt;
          end
      end

  endmodule
why: "Proper FSM design prevents latch inference and ensures safe synthesis"

pipeline_design: name: Pipeline Design Pattern description: Multi-stage pipeline for high throughput pattern: | // Pipeline with Valid/Ready Handshaking // Maintains throughput while allowing backpressure

  // pipeline_stage: one register stage with valid/ready handshaking.
  //
  // Parameters:
  //   DATA_WIDTH - width of in_data / out_data.
  // Ports:
  //   clk, rst_n - clock and active-low asynchronous reset (clears out_valid
  //                and out_data).
  //   in_valid   - upstream offers a word on in_data.
  //   in_ready   - this stage can take a word: high when the output register
  //                is empty or is being drained by out_ready (combinational).
  //   out_valid  - the output register holds a word.
  //   out_ready  - downstream accepts the word this cycle.
  //   out_data   - registered word; changes only when a new word is accepted.
  module pipeline_stage #(
      parameter DATA_WIDTH = 32
  )(
      input  wire                  clk,
      input  wire                  rst_n,

      // Upstream side
      input  wire                  in_valid,
      output wire                  in_ready,
      input  wire [DATA_WIDTH-1:0] in_data,

      // Downstream side
      output reg                   out_valid,
      input  wire                  out_ready,
      output reg  [DATA_WIDTH-1:0] out_data
  );

      // The slot is free when it holds nothing or its content leaves this cycle.
      assign in_ready = !out_valid || out_ready;

      // Valid flag: follows in_valid whenever the slot is free, otherwise holds.
      always @(posedge clk or negedge rst_n) begin
          if (!rst_n)
              out_valid <= 1'b0;
          else if (in_ready)
              out_valid <= in_valid;
      end

      // Data register: loaded only on an actual transfer (slot free AND a
      // word offered), so bubbles do not disturb the stored value.
      always @(posedge clk or negedge rst_n) begin
          if (!rst_n)
              out_data <= {DATA_WIDTH{1'b0}};
          else if (in_ready && in_valid)
              out_data <= in_data;  // pass-through; replace with stage logic
      end

  endmodule

  // Multi-stage pipeline instantiation
  // data_pipeline: chain of NUM_STAGES pipeline_stage instances sharing one
  // clock and reset.
  //
  // Parameters:
  //   DATA_WIDTH - word width carried through the chain.
  //   NUM_STAGES - number of pipeline_stage instances; with 0 the inputs are
  //                wired straight to the outputs.
  // Ports:
  //   in_valid / in_ready / in_data    - handshake into the first stage.
  //   out_valid / out_ready / out_data - handshake out of the last stage.
  module data_pipeline #(
      parameter DATA_WIDTH = 32,
      parameter NUM_STAGES = 4
  )(
      input  wire                  clk,
      input  wire                  rst_n,
      input  wire                  in_valid,
      output wire                  in_ready,
      input  wire [DATA_WIDTH-1:0] in_data,
      output wire                  out_valid,
      input  wire                  out_ready,
      output wire [DATA_WIDTH-1:0] out_data
  );

      // Link k sits between stage k-1 and stage k; link 0 is the module
      // input and link NUM_STAGES is the module output.
      wire [NUM_STAGES:0]                  link_valid;
      wire [NUM_STAGES:0]                  link_ready;
      // All link data words packed side by side: link k occupies
      // bits [k*DATA_WIDTH +: DATA_WIDTH].
      wire [(NUM_STAGES+1)*DATA_WIDTH-1:0] link_data;

      // Head of the chain.
      assign link_valid[0]             = in_valid;
      assign link_data[DATA_WIDTH-1:0] = in_data;
      assign in_ready                  = link_ready[0];

      // Tail of the chain.
      assign link_ready[NUM_STAGES] = out_ready;
      assign out_valid              = link_valid[NUM_STAGES];
      assign out_data               = link_data[NUM_STAGES*DATA_WIDTH +: DATA_WIDTH];

      genvar i;
      generate
          for (i = 0; i < NUM_STAGES; i = i + 1) begin : gen_stages
              pipeline_stage #(
                  .DATA_WIDTH (DATA_WIDTH)
              ) stage (
                  .clk       (clk),
                  .rst_n     (rst_n),
                  .in_valid  (link_valid[i]),
                  .in_ready  (link_ready[i]),
                  .in_data   (link_data[i*DATA_WIDTH +: DATA_WIDTH]),
                  .out_valid (link_valid[i+1]),
                  .out_ready (link_ready[i+1]),
                  .out_data  (link_data[(i+1)*DATA_WIDTH +: DATA_WIDTH])
              );
          end
      endgenerate

  endmodule
why: "Pipelining increases throughput and helps meet timing constraints"

memory_interface: name: Memory Interface Patterns description: BRAM and external memory interfaces pattern: | // Synchronous Block RAM (BRAM) - True Dual-Port // Xilinx/Intel will infer BRAM from this pattern

  // true_dual_port_ram: memory with two independent ports, each with its own
  // clock, enable, write enable, address, write data and registered read data.
  //
  // Parameters:
  //   DATA_WIDTH - width of each word.
  //   ADDR_WIDTH - address width; the array holds 2^ADDR_WIDTH words.
  //
  // Per-port behaviour (identical for A and B):
  //   - Nothing happens on a clock edge unless en_x is high; dout_x then
  //     keeps its previous value.
  //   - With en_x high, dout_x is loaded with the word at addr_x.
  //   - With en_x and we_x high, din_x is also written to addr_x; dout_x
  //     receives the word that was stored BEFORE the write (read-first).
  //   - There is no reset; ram and dout_x are unknown in simulation until written.
  //
  // NOTE(review): the two ports are not arbitrated. Behaviour when both ports
  // access the same address at the same time (write/write or write/read) is
  // not defined by this code - confirm against the target device's BRAM rules.
  module true_dual_port_ram #(
      parameter DATA_WIDTH = 32,
      parameter ADDR_WIDTH = 10  // 1024 words
  )(
      // Port A
      input  wire                  clk_a,
      input  wire                  en_a,
      input  wire                  we_a,
      input  wire [ADDR_WIDTH-1:0] addr_a,
      input  wire [DATA_WIDTH-1:0] din_a,
      output reg  [DATA_WIDTH-1:0] dout_a,

      // Port B
      input  wire                  clk_b,
      input  wire                  en_b,
      input  wire                  we_b,
      input  wire [ADDR_WIDTH-1:0] addr_b,
      input  wire [DATA_WIDTH-1:0] din_b,
      output reg  [DATA_WIDTH-1:0] dout_b
  );

      localparam DEPTH = 1 << ADDR_WIDTH;

      // RAM storage, shared by both ports
      (* ram_style = "block" *)  // Synthesis attribute requesting block RAM
      reg [DATA_WIDTH-1:0] ram [0:DEPTH-1];

      // Port A: synchronous write and synchronous read on clk_a
      always @(posedge clk_a) begin
          if (en_a) begin
              if (we_a)
                  ram[addr_a] <= din_a;
              // Non-blocking read in the same edge sees the old contents
              dout_a <= ram[addr_a];  // Read-first mode
          end
      end

      // Port B: synchronous write and synchronous read on clk_b
      always @(posedge clk_b) begin
          if (en_b) begin
              if (we_b)
                  ram[addr_b] <= din_b;
              // Same read-first behaviour as port A
              dout_b <= ram[addr_b];
          end
      end

  endmodule

  // AXI-Stream Interface (for data streaming)
  // axis_register_slice: single register stage on an AXI-Stream style
  // interface carrying TDATA and TLAST.
  //
  // Parameters:
  //   DATA_WIDTH - width of tdata.
  // Ports:
  //   aclk, aresetn  - clock and active-low asynchronous reset (clears the
  //                    output registers).
  //   s_axis_*       - input side; s_axis_tready is combinational and is high
  //                    when the output register is empty or m_axis_tready is high.
  //   m_axis_*       - output side; all three signals are registered.
  //
  // Whenever s_axis_tready is high the output registers are reloaded from the
  // input, including tdata/tlast in cycles where s_axis_tvalid is low.
  module axis_register_slice #(
      parameter DATA_WIDTH = 32
  )(
      input  wire                  aclk,
      input  wire                  aresetn,

      // Slave interface (input)
      input  wire                  s_axis_tvalid,
      output wire                  s_axis_tready,
      input  wire [DATA_WIDTH-1:0] s_axis_tdata,
      input  wire                  s_axis_tlast,

      // Master interface (output)
      output reg                   m_axis_tvalid,
      input  wire                  m_axis_tready,
      output reg  [DATA_WIDTH-1:0] m_axis_tdata,
      output reg                   m_axis_tlast
  );

      // The register may be (re)loaded when it is empty or being read.
      wire load_en = !m_axis_tvalid || m_axis_tready;

      assign s_axis_tready = load_en;

      always @(posedge aclk or negedge aresetn) begin
          if (!aresetn) begin
              m_axis_tvalid                <= 1'b0;
              {m_axis_tlast, m_axis_tdata} <= {1'b0, {DATA_WIDTH{1'b0}}};
          end else begin
              if (load_en) begin
                  m_axis_tvalid                <= s_axis_tvalid;
                  // Payload (last flag + data) is captured as one unit.
                  {m_axis_tlast, m_axis_tdata} <= {s_axis_tlast, s_axis_tdata};
              end
          end
      end

  endmodule
why: "Proper memory patterns ensure efficient BRAM utilization"

timing_constraints: name: Timing Constraints description: SDC/XDC timing constraint patterns pattern: | # Xilinx XDC Timing Constraints

  # ---------------------------------------------------------------------------
  # Clocks
  # ---------------------------------------------------------------------------

  # Primary clock definition: 10 ns period (100 MHz) on port clk_100mhz
  create_clock -period 10.000 -name sys_clk [get_ports clk_100mhz]

  # Generated clock (from PLL/MMCM): 2x the PLL input clock
  create_generated_clock -name clk_200mhz \
      -source [get_pins pll_inst/CLKIN1] \
      -multiply_by 2 \
      [get_pins pll_inst/CLKOUT0]

  # ---------------------------------------------------------------------------
  # I/O timing
  # Bus patterns are wrapped in braces so Tcl never treats [*] as a command
  # substitution.
  # ---------------------------------------------------------------------------

  # Input delay constraints
  # Data arrives 2ns after clock edge, with 0.5ns uncertainty
  set_input_delay -clock sys_clk -max 2.5 [get_ports {data_in[*]}]
  set_input_delay -clock sys_clk -min 2.0 [get_ports {data_in[*]}]

  # Output delay constraints
  set_output_delay -clock sys_clk -max 3.0 [get_ports {data_out[*]}]
  set_output_delay -clock sys_clk -min 0.5 [get_ports {data_out[*]}]

  # ---------------------------------------------------------------------------
  # Clock domain crossing
  # Pick exactly ONE of the options below for a given pair of clocks.
  # set_clock_groups and set_false_path take precedence over set_max_delay,
  # so leaving Option 1 or 2 active together with Option 3 silently disables
  # the max-delay check.
  # ---------------------------------------------------------------------------

  # Option 1 (active): declare the two domains asynchronous, both directions.
  set_clock_groups -asynchronous \
      -group [get_clocks clk_a] \
      -group [get_clocks clk_b]

  # Option 2: false path only into the synchronizer registers
  # (timing is handled by the synchronizer, not place-and-route).
  # set_false_path -from [get_clocks clk_a] \
  #     -to [get_cells -hier -filter {ASYNC_REG==TRUE}]

  # Option 3: keep the crossing timed, bounding only its datapath delay.
  # set_max_delay -datapath_only -from [get_clocks clk_a] \
  #     -to [get_clocks clk_b] 5.0

  # ---------------------------------------------------------------------------
  # Timing exceptions
  # ---------------------------------------------------------------------------

  # Multicycle path (for pipelined logic)
  # Allow 2 clock cycles for setup; the matching hold value is setup - 1.
  # The startpoint is given as the register cell: a Q output pin is not a
  # valid timing startpoint.
  set_multicycle_path 2 -setup -from [get_cells slow_reg] \
      -to [get_pins result_reg/D]
  set_multicycle_path 1 -hold -from [get_cells slow_reg] \
      -to [get_pins result_reg/D]

  # False paths for static configuration inputs
  set_false_path -from [get_ports config_*]

  # ---------------------------------------------------------------------------
  # Pin locations (IO constraints)
  # ---------------------------------------------------------------------------
  set_property PACKAGE_PIN Y9 [get_ports clk_100mhz]
  set_property IOSTANDARD LVCMOS33 [get_ports clk_100mhz]
why: "Correct timing constraints are essential for reliable synthesis"

anti_patterns:

combinational_loop: name: Combinational Logic Loop problem: "Feedback without register causes oscillation/undefined behavior" solution: "Break loops with registers; check synthesis warnings"

latch_inference: name: Unintentional Latch Inference problem: "Incomplete case/if statements create latches" solution: "Assign default values at start of always block"

async_reset_release: name: Asynchronous Reset Release problem: "Releasing reset asynchronously can cause metastability" solution: "Use synchronous de-assertion: async assert, sync release"

multi_driver: name: Multiple Drivers on Signal problem: "Signal driven from multiple always blocks" solution: "Single driver per signal; use case/if for muxing"

handoffs:

to: embedded-systems when: "FPGA interfaces with MCU" pass: "Interface protocol, timing, register map"
to: control-systems when: "Implementing control algorithm in FPGA" pass: "Sample rate, fixed-point format, latency requirements"

ecosystem: vendors: - "Xilinx/AMD - Vivado, Vitis" - "Intel/Altera - Quartus" - "Lattice - Diamond, Radiant" - "Microchip - Libero"

tools: - "Vivado - Xilinx synthesis/implementation" - "Quartus - Intel synthesis" - "Yosys - Open-source synthesis" - "Verilator - Fast simulation" - "cocotb - Python testbenches" - "GHDL - VHDL simulation"

hls: - "Vitis HLS - Xilinx High-Level Synthesis" - "Intel HLS Compiler" - "Catapult HLS - Siemens"