From eb1a8df9351b50a2845dafd925d58f3a1fa8941a Mon Sep 17 00:00:00 2001 From: h3n4l Date: Tue, 26 Aug 2025 14:40:22 +0800 Subject: [PATCH 1/9] feat: initialize fuzz --- .gitignore | 5 +- go.mod | 1 + go.sum | 2 + tools/fuzzing/DESIGN.md | 536 ++++++++++++------ tools/fuzzing/Makefile | 59 ++ tools/fuzzing/README.md | 124 ++++ tools/fuzzing/cmd/fuzzer/main.go | 108 ++++ tools/fuzzing/internal/config/config.go | 93 +++ tools/fuzzing/internal/generator/generator.go | 217 +++++++ tools/fuzzing/internal/grammar/discovery.go | 187 ++++++ tools/fuzzing/internal/grammar/parser.go | 259 +++++++++ 11 files changed, 1413 insertions(+), 178 deletions(-) create mode 100644 tools/fuzzing/Makefile create mode 100644 tools/fuzzing/README.md create mode 100644 tools/fuzzing/cmd/fuzzer/main.go create mode 100644 tools/fuzzing/internal/config/config.go create mode 100644 tools/fuzzing/internal/generator/generator.go create mode 100644 tools/fuzzing/internal/grammar/discovery.go create mode 100644 tools/fuzzing/internal/grammar/parser.go diff --git a/.gitignore b/.gitignore index d262e9c..a5a239e 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,7 @@ go.work.sum # node_modules **/node_modules/ -**/*.class \ No newline at end of file +**/*.class + +# No binary files +**/bin/** \ No newline at end of file diff --git a/go.mod b/go.mod index ff4a681..9e2614c 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( require ( github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index c96f3a5..7367760 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ github.com/bytebase/antlr/v4 v4.0.0-20240827034948-8c385f108920 h1:IfmPt5o5R70NK github.com/bytebase/antlr/v4 v4.0.0-20240827034948-8c385f108920/go.mod h1:ykhjIPiv0IWpu3OGXCHdz2eUSe8UNGGD6baqjs8jSuU= github.com/davecgh/go-spew 
v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= diff --git a/tools/fuzzing/DESIGN.md b/tools/fuzzing/DESIGN.md index d5fc9c0..e78484f 100644 --- a/tools/fuzzing/DESIGN.md +++ b/tools/fuzzing/DESIGN.md @@ -2,229 +2,411 @@ ## Overview -A fuzzing tool that generates valid SQL inputs by analyzing ANTLR v4 grammar files, ensuring comprehensive parser testing with syntactically correct queries that can stress-test parsing performance and correctness. +A simple fuzzing tool that generates SQL inputs from ANTLR grammar rules to test parser performance on specific constructs. -## Goals +## Core Problems & Solutions -- **Valid Input Generation**: Generate syntactically correct SQL queries based on grammar rules -- **Performance Testing**: Create complex queries to test parser performance limits -- **Coverage Maximization**: Exercise all grammar rules and edge cases -- **Automated Testing**: Integrate with CI for continuous parser validation +### 1. Target Specific Rules +**Problem**: Performance issues often occur in specific rules (e.g., `createProcedureStatement`) +**Solution**: Allow users to specify starting rule chains -## Architecture +```bash +./fuzzer --grammar postgresql --start-rule createProcedureStatement --count 100 +./fuzzer --grammar cql --start-rule selectStatement.whereClause --count 50 +``` + +### 2. 
Recursion Control +**Problem**: Grammar rules can be recursive, causing infinite loops during generation +**Solution**: Limit recursion depth per rule (proven to handle all ANTLR recursion types) +#### ANTLR 4 Recursion Types + +**Direct Left Recursion:** +```antlr +expr: expr '+' expr | INT // expr directly refers to itself on left ``` -tools/fuzzing/ -├── generator/ # Core generation logic -│ ├── grammar_analyzer.go # Parse ANTLR grammar files -│ ├── rule_expander.go # Expand grammar rules to concrete syntax -│ └── query_builder.go # Build SQL queries from rule expansions -├── strategies/ # Different generation strategies -│ ├── depth_first.go # Generate deeply nested structures -│ ├── breadth_first.go # Generate wide, complex queries -│ └── weighted.go # Probability-based rule selection -├── corpus/ # Generated test cases and seeds -│ ├── seeds/ # Hand-crafted seed inputs -│ └── generated/ # Auto-generated test cases -└── cmd/ # CLI tools - └── fuzzer/ # Main fuzzer executable -``` - -## Core Components - -### 1. Grammar Analyzer - -Leverages the existing `tools/grammar/` ANTLR v4 parser to: -- Parse target grammar files (e.g., `postgresql.g4`, `cql.g4`) -- Extract production rules and their alternatives -- Build dependency graph between rules -- Identify terminal vs non-terminal symbols + +**Direct Right Recursion:** +```antlr +expr: INT '+' expr | INT // expr directly refers to itself on right +``` + +**Indirect Recursion (Non-Left):** +```antlr +selectStmt: SELECT columns fromClause whereClause? +whereClause: WHERE expr +expr: '(' selectStmt ')' | INT // Indirect: expr -> selectStmt -> whereClause -> expr +``` +*Note: ANTLR 4 does NOT support mutually left recursive grammars. 
This example is valid because the recursion is not left-recursive (selectStmt doesn't start with selectStmt).* + +**Self-Recursion with Alternatives:** +```antlr +stmt: ifStmt | whileStmt | blockStmt +blockStmt: '{' stmt* '}' // blockStmt contains multiple stmt references +``` + +#### Why Depth Control Works + +**Theorem**: Any grammar rule expansion terminates in finite steps with depth limiting. + +**Proof by Contradiction:** +1. Assume infinite expansion despite depth limit `D` +2. Each recursive call increases depth: `depth(rule_n) = depth(rule_{n-1}) + 1` +3. When `depth ≥ D`, generator forces terminal selection +4. Therefore, maximum expansion depth is bounded by `D` +5. Since each rule has finite alternatives and finite elements, total expansion is finite ∎ + +#### Depth Control Implementation ```go -type GrammarAnalyzer struct { - parser *grammar.ANTLRv4Parser - rules map[string]*Rule +func (g *Generator) GenerateFromRule(ruleName string, currentDepth int) string { + // Base case: exceed depth limit -> force terminal + if currentDepth >= g.maxDepth { + return g.forceTerminal(ruleName) + } + + rule := g.grammar.GetRule(ruleName) + + // Prefer non-recursive alternatives as depth increases + alternative := g.selectAlternativeWithDepthBias(rule, currentDepth) + + result := "" + for _, element := range alternative { + if element.IsRule() { + // Recursive call with incremented depth + result += g.GenerateFromRule(element.Name, currentDepth+1) + } else { + result += element.Literal + } + } + return result } -type Rule struct { - Name string - Alternatives []Alternative - Type RuleType // LEXER, PARSER, FRAGMENT +func (g *Generator) forceTerminal(ruleName string) string { + rule := g.grammar.GetRule(ruleName) + + // Find non-recursive alternatives (containing only terminals) + for _, alt := range rule.Alternatives { + if !alt.ContainsRecursion() { + return g.expandAlternative(alt, g.maxDepth) + } + } + + // Fallback: use default terminal for this rule type + return 
g.getDefaultTerminal(ruleName) } ``` -### 2. Rule Expander +#### Examples with Depth Control + +```bash +./fuzzer --start-rule expr --max-depth 3 --count 5 +``` + +**Generated sequences:** +- Depth 0: `INT` (terminal) +- Depth 1: `INT + INT` +- Depth 2: `(INT + INT) + INT` +- Depth 3: `((INT + INT) + INT) + INT` (max depth reached) -Recursively expands grammar rules into concrete syntax trees: -- Handles rule recursion with configurable depth limits -- Supports probability-weighted alternative selection -- Manages lexer rules and literal generation -- Tracks generation context for smart decisions +**Complex mutual recursion:** +```bash +./fuzzer --start-rule selectStmt --max-depth 4 --count 3 +``` -```go -type RuleExpander struct { - grammar *ParsedGrammar - maxDepth int - weights map[string]float64 - random *rand.Rand -} +**Expansion trace:** +``` +selectStmt (depth=0) +├── SELECT columns FROM table whereClause (depth=0) + └── whereClause (depth=1) + └── WHERE expr (depth=1) + └── '(' selectStmt ')' (depth=2) + └── selectStmt (depth=2) + └── SELECT columns FROM table (depth=2, no whereClause to avoid depth=4) ``` -### 3. 
Query Builder +#### Depth Strategy Options -Converts syntax trees to executable SQL strings: -- Handles whitespace and formatting -- Manages identifier generation (table names, columns) -- Ensures semantic consistency where possible -- Outputs parseable query strings +**Conservative (Early Termination):** +- Lower max depth (3-5) +- Bias toward terminals as depth increases +- Prevents deep nesting, faster generation -## Generation Strategies +**Aggressive (Deep Testing):** +- Higher max depth (10-15) +- Equal probability until max depth +- Tests parser limits, slower generation -### Depth-First Strategy -- Generates deeply nested subqueries, expressions -- Tests parser stack limits and recursion handling -- Focuses on structural complexity +```bash +# Conservative - quick, shallow testing +./fuzzer --start-rule expr --max-depth 3 --depth-strategy conservative -### Breadth-First Strategy -- Creates wide queries with many clauses, joins, columns -- Tests parser memory usage and performance -- Focuses on query size and breadth +# Aggressive - deep parser stress testing +./fuzzer --start-rule createProcedureStmt --max-depth 12 --depth-strategy aggressive +``` -### Weighted Strategy -- Uses probability weights for rule selection -- Biases toward commonly used constructs -- Configurable via weight files per dialect +### 3. Optional Rule Probability +**Problem**: Optional rules (`selectStmt: SELECT columns FROM table whereClause?`) need probability control +**Solution**: Configure probability for optional elements (standard in grammar-based fuzzing) -## Integration Points +### 4. 
Quantified Rule Generation +**Problem**: Quantified rules (`stmt*`, `expr+`, `column{1,5}`) need count control +**Solution**: Configure generation counts for quantified elements -### With Existing Grammar Parser -```go -// Reuse tools/grammar/ for parsing target grammars -analyzer := NewGrammarAnalyzer() -targetGrammar, err := analyzer.ParseGrammarFile("postgresql/PostgreSQLLexer.g4") +#### ANTLR 4 Quantifier Types + +*Note: ANTLR 4 itself only defines the `?`, `*` and `+` suffixes (plus their non-greedy forms `??`, `*?`, `+?`); in a real `.g4` file `{...}` introduces an embedded action, not a repetition count. The `{n}` and `{min,max}` forms below are a fuzzer-side notation and will not appear in parsed grammar input.* + +**Zero or More (`rule*`):** +```antlr +blockStmt: '{' stmt* '}' // Generate 0 to N statements +selectList: column (',' column)* // Generate 1 to N columns +``` + +**One or More (`rule+`):** +```antlr +identifier: LETTER (LETTER | DIGIT)+ // LETTER followed by 1 to N more characters +``` + +**Exact Count (`rule{n}`):** +```antlr +hexDigit: HEX_DIGIT{4} // Generate exactly 4 hex digits +``` + +**Range Count (`rule{min,max}`):** +```antlr +varchar: CHAR{1,255} // Generate 1 to 255 characters +``` + +#### Quantifier Control Strategy + +**Count Distribution Options:** +- **Uniform**: Equal probability for each count in range +- **Exponential**: Higher probability for lower counts (realistic) +- **Fixed**: Always generate specific count + +```bash +# Basic usage - user specifies max count +./fuzzer --start-rule blockStmt --max-quantifier 10 --count 100 + +# User controls both min and max for quantifiers +./fuzzer --start-rule selectList --min-quantifier 1 --max-quantifier 5 --count 50 + +# Fixed count for performance testing +./fuzzer --start-rule selectStmt --quantifier-count 100 --count 10 ``` -### With Parser Testing +#### Implementation Logic + ```go -// Generate test cases for specific parser -fuzzer := NewFuzzer(postgresqlGrammar) -queries := fuzzer.GenerateQueries(1000) - -for _, query := range queries { - // Test against postgresql parser - result := postgresqlParser.Parse(query) - // Collect metrics, detect crashes +type QuantifierConfig struct { + Strategy string // "uniform", "exponential", "fixed" + MinRepeat int // Minimum repetitions (overrides grammar min) 
+ MaxRepeat int // Maximum repetitions (overrides grammar max) + FixedCount int // Fixed count for "fixed" strategy +} + +func (g *Generator) generateQuantified(element *GrammarElement, config QuantifierConfig) string { + var count int + + switch element.Quantifier { + case "*": // Zero or more + min := max(0, config.MinRepeat) + max := min(config.MaxRepeat, 50) // Reasonable default limit + count = g.selectCount(min, max, config.Strategy) + + case "+": // One or more + min := max(1, config.MinRepeat) + max := min(config.MaxRepeat, 50) + count = g.selectCount(min, max, config.Strategy) + + case "{n}": // Exact count + if config.Strategy == "fixed" { + count = config.FixedCount + } else { + count = element.ExactCount + } + + case "{min,max}": // Range + min := max(element.MinCount, config.MinRepeat) + max := min(element.MaxCount, config.MaxRepeat) + count = g.selectCount(min, max, config.Strategy) + } + + result := "" + for i := 0; i < count; i++ { + if element.IsRule() { + result += g.GenerateFromRule(element.RuleName, g.currentDepth+1) + } else { + result += element.Literal + } + + // Add separators for lists (e.g., comma-separated) + if i < count-1 && element.HasSeparator() { + result += element.Separator + } + } + return result +} + +func (g *Generator) selectCount(min, max int, strategy string) int { + if min > max { + return min + } + + switch strategy { + case "fixed": + return min // Use minimum as fixed value + + case "uniform": + return min + g.random.Intn(max-min+1) + + case "exponential": + // Exponential decay: higher probability for lower counts + range_size := max - min + 1 + // Generate exponentially distributed number, then map to range + lambda := 2.0 / float64(range_size) + exp_val := g.random.ExpFloat64() / lambda + count := min + int(exp_val) + if count > max { + count = max + } + return count + + default: + return min + g.random.Intn(max-min+1) + } } ``` -## Configuration - -### Fuzzer Config -```yaml -target_grammar: "postgresql" -strategies: 
- - name: "depth_first" - weight: 0.3 - max_depth: 15 - - name: "breadth_first" - weight: 0.4 - max_width: 50 - - name: "weighted" - weight: 0.3 - weights_file: "postgresql_weights.yaml" - -generation: - count: 10000 - max_query_length: 100000 - seed: 42 - -output: - format: "sql" - directory: "corpus/generated" -``` - -### Grammar Weights -```yaml -# postgresql_weights.yaml -rules: - selectStmt: 0.4 - insertStmt: 0.2 - updateStmt: 0.2 - deleteStmt: 0.1 - createStmt: 0.1 - - # Bias toward complex expressions - expr: - binaryOp: 0.4 - functionCall: 0.3 - subquery: 0.2 - literal: 0.1 +#### Examples with Quantifier Control + +**Block statement with multiple statements:** +```bash +./fuzzer --start-rule blockStmt --quantifier-strategy exponential --max-repeat 8 ``` +**Generated:** +- 70% chance: `{ stmt; }` (1 statement) +- 20% chance: `{ stmt; stmt; }` (2 statements) +- 7% chance: `{ stmt; stmt; stmt; }` (3 statements) +- 3% chance: 4+ statements + +**Column list generation:** +```bash +./fuzzer --start-rule selectList --quantifier-strategy uniform --min-repeat 3 --max-repeat 7 +``` +**Generated:** +- Equal probability: `col1, col2, col3` to `col1, col2, col3, col4, col5, col6, col7` -## CLI Interface +**Performance testing with large lists:** +```bash +./fuzzer --start-rule selectStmt --quantifier-count 100 --count 5 +``` +**Generated:** +- Always generates exactly 100 columns to test parser performance on large SELECT lists +**Simple user control:** ```bash -# Generate queries for PostgreSQL -./fuzzer generate --grammar postgresql --count 1000 --strategy weighted +./fuzzer --start-rule blockStmt --max-quantifier 3 --count 10 +``` +**Generated:** +- `stmt*` generates 0-3 statements +- `expr+` generates 1-3 expressions +- User controls maximum without complex strategy options -# Run continuous fuzzing with performance metrics -./fuzzer fuzz --grammar cql --duration 1h --metrics +```bash +./fuzzer --start-rule selectStmt --optional-prob 0.7 --count 100 +# 70% chance to 
include optional whereClause +``` + +## Simple Architecture -# Validate existing corpus against parser -./fuzzer validate --grammar postgresql --corpus corpus/postgresql/ +``` +tools/fuzzing/ +├── cmd/fuzzer/main.go # CLI entry point +├── internal/generator/generator.go # Core generation logic +└── internal/grammar/parser.go # Reuse tools/grammar/ ``` -## Performance Metrics +## Core Logic -### Generation Metrics -- Queries generated per second -- Grammar rule coverage percentage -- Distribution of query complexity (depth, width) +```go +type Generator struct { + grammar *ParsedGrammar + maxDepth int + optionalProb float64 + random *rand.Rand +} -### Parser Testing Metrics -- Parse success rate -- Average parse time per query -- Memory usage during parsing -- Parser crash/error detection +func (g *Generator) GenerateFromRule(ruleName string, currentDepth int) string { + if currentDepth >= g.maxDepth { + return g.generateTerminal() // Stop recursion + } + + rule := g.grammar.GetRule(ruleName) + alternative := g.selectAlternative(rule) + + result := "" + for _, element := range alternative { + if element.IsOptional() && g.random.Float64() > g.optionalProb { + continue // Skip optional element + } + if element.IsRule() { + result += g.GenerateFromRule(element.Name, currentDepth+1) + } else { + result += element.Literal + } + } + return result +} +``` + +## CLI Interface + +```bash +# Basic usage - generate from specific rule +./fuzzer --grammar postgresql --start-rule selectStmt --count 10 + +# Control recursion depth +./fuzzer --grammar cql --start-rule expr --max-depth 3 --count 5 -## Implementation Phases +# Control optional probability +./fuzzer --grammar postgresql --start-rule createStmt --optional-prob 0.8 --count 10 -### Phase 1: Foundation (Week 1-2) -- Basic grammar analyzer using existing ANTLR parser -- Simple rule expander with depth-first strategy -- Command-line interface for manual testing +# Control quantifier max count (for rule*, rule+) +./fuzzer --grammar postgresql --start-rule 
blockStmt --max-quantifier 8 --count 20 -### Phase 2: Core Features (Week 3-4) -- Multiple generation strategies -- Configuration system -- Basic corpus management -- Integration with existing parser tests +# Control all parameters together +./fuzzer --grammar cql --start-rule selectStmt \ + --max-depth 5 \ + --optional-prob 0.7 \ + --max-quantifier 10 \ + --count 50 + +# Output to file +./fuzzer --grammar postgresql --start-rule selectStmt --count 100 --output queries.sql +``` -### Phase 3: Advanced Features (Week 5-6) -- Weighted generation with probability tuning -- Performance metrics collection -- CI integration for continuous fuzzing -- Corpus minimization and deduplication +## Implementation Steps -### Phase 4: Optimization (Week 7-8) -- Generation performance optimization -- Advanced semantic awareness -- Custom mutation strategies -- Comprehensive documentation +### Step 1: Basic Generator +- Parse grammar using existing `tools/grammar/` +- Simple rule expansion with depth limit +- CLI with `--start-rule`, `--max-depth`, `--count` -## Future Enhancements +### Step 2: Optional Control +- Add `--optional-prob` flag +- Detect optional elements in grammar rules +- Apply probability during generation -- **Semantic Awareness**: Generate queries with valid schema references -- **Mutation-Based Fuzzing**: Mutate existing queries to explore edge cases -- **Differential Testing**: Compare parser outputs across database dialects -- **Performance Regression Detection**: Track parser performance over time -- **Grammar Evolution**: Adapt fuzzing as grammars evolve +### Step 3: Integration +- Test generated queries against parsers +- Add basic performance timing +- CI integration for regression testing -## Dependencies +## Common Fuzzing Techniques Used -- Existing `tools/grammar/` ANTLR v4 parser -- Go standard library (`rand`, `fmt`, `strings`) -- YAML configuration parsing -- CLI framework (e.g., `cobra`) +1. 
**Grammar-based generation** - Generate from formal grammar rules +2. **Depth limiting** - Prevent infinite recursion in recursive grammars +3. **Probability-based selection** - Control optional rule inclusion +4. **Targeted fuzzing** - Focus on specific rule paths instead of full grammar -This design provides a solid foundation for grammar-aware fuzzing while leveraging our existing ANTLR infrastructure. \ No newline at end of file +This approach is much simpler but addresses your specific needs for testing parser performance on particular constructs. \ No newline at end of file diff --git a/tools/fuzzing/Makefile b/tools/fuzzing/Makefile new file mode 100644 index 0000000..20503f2 --- /dev/null +++ b/tools/fuzzing/Makefile @@ -0,0 +1,59 @@ +BINARY_NAME=fuzzer +BUILD_DIR=bin +CMD_PATH=github.com/bytebase/parser/tools/fuzzing/cmd/fuzzer + +.PHONY: all build test clean run help + +all: build test + +# Build the binary +build: + @echo "Building $(BINARY_NAME)..." + @mkdir -p $(BUILD_DIR) + go build -o $(BUILD_DIR)/$(BINARY_NAME) $(CMD_PATH) + +# Run tests +test: + @echo "Running tests..." + go test -v github.com/bytebase/parser/tools/fuzzing/... + +# Clean build artifacts +clean: + @echo "Cleaning..." + rm -rf $(BUILD_DIR) + go clean + +# Run the fuzzer (requires arguments) +run: + go run $(CMD_PATH) $(ARGS) + +# Install dependencies +deps: + @echo "Installing dependencies..." + cd ../.. && go mod tidy && go mod download + +# Format code +fmt: + @echo "Formatting code..." + go fmt github.com/bytebase/parser/tools/fuzzing/... + +# Run linter +lint: + @echo "Running linter..." 
+ golangci-lint run + +# Show help +help: + @echo "Available targets:" + @echo " build - Build the fuzzer binary" + @echo " test - Run all tests" + @echo " clean - Clean build artifacts" + @echo " run - Run the fuzzer (use ARGS='--grammar postgresql --start-rule selectStmt')" + @echo " deps - Install/update dependencies" + @echo " fmt - Format all Go code" + @echo " lint - Run golangci-lint" + @echo " help - Show this help message" + @echo "" + @echo "Examples:" + @echo " make run ARGS='--grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 5'" + @echo " make run ARGS='--help'" \ No newline at end of file diff --git a/tools/fuzzing/README.md b/tools/fuzzing/README.md new file mode 100644 index 0000000..64f4409 --- /dev/null +++ b/tools/fuzzing/README.md @@ -0,0 +1,124 @@ +# Grammar-Aware Fuzzing Tool + +A fuzzing tool that generates valid SQL inputs from ANTLR v4 grammar files for parser testing. + +## Quick Start + +```bash +# Build the fuzzer +make build + +# List available grammars +./bin/fuzzer --list-grammars + +# Single combined grammar file +./bin/fuzzer --grammar combined.g4 --start-rule selectStmt --count 10 + +# Separate lexer and parser files +./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 10 + +# Run with custom parameters +./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 3 --max-quantifier 8 --count 5 +``` + +## Project Structure + +``` +tools/fuzzing/ +├── cmd/fuzzer/ # CLI application entry point +│ └── main.go +├── internal/ # Private application packages +│ ├── config/ # Configuration management +│ └── generator/ # Core fuzzing logic +├── bin/ # Built binaries (created by make build) +├── Makefile # Build and development tasks +└── go.mod # Go module definition +``` + +## CLI Options + +| Flag | Description | Default | +|------|-------------|---------| +| `--grammar` | Grammar file(s): single file 
or comma-separated lexer,parser | - | +| `--start-rule` | Starting grammar rule (required) | - | +| `--count` | Number of queries to generate | 10 | +| `--max-depth` | Maximum recursion depth | 5 | +| `--optional-prob` | Probability of optional elements (0.0-1.0) | 0.5 | +| `--max-quantifier` | Maximum count for `*` and `+` rules | 5 | +| `--min-quantifier` | Minimum count override | 0 | +| `--quantifier-count` | Fixed count for all quantifiers | 0 | +| `--output` | Output file path | stdout | +| `--seed` | Random seed for reproducible results | current time | + +## Examples + +### Basic Usage +```bash +# Single combined grammar file +./bin/fuzzer --grammar combined.g4 --start-rule selectStmt --count 10 + +# Separate lexer and parser files +./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 10 + +# Generate CQL expressions with limited depth +./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 3 --count 5 +``` + +### Performance Testing +```bash +# Generate queries with exactly 100 columns +./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --quantifier-count 100 --count 5 + +# Generate deeply nested expressions +./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 15 --count 10 +``` + +### Output Control +```bash +# Save to file +./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 100 --output queries.sql + +# Reproducible generation +./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --seed 42 --count 10 +``` + +## Development + +### Build Commands +```bash +# From tools/fuzzing directory +make build # Build binary to bin/fuzzer +make test # Run all tests +make clean # Clean build artifacts +make fmt # Format code +make deps # Install/update dependencies (runs from repo root) + +# From 
repository root +go build -o tools/fuzzing/bin/fuzzer github.com/bytebase/parser/tools/fuzzing/cmd/fuzzer +``` + +### Running During Development +```bash +# From tools/fuzzing directory +make run ARGS='--grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 5' +make run ARGS='--help' + +# From repository root +go run github.com/bytebase/parser/tools/fuzzing/cmd/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 5 +``` + +## Monolithic Repository Structure + +This tool uses the single `go.mod` file at the repository root: +- **Module**: `github.com/bytebase/parser` +- **Import path**: `github.com/bytebase/parser/tools/fuzzing/...` +- **Dependencies**: Shared with other tools in the repository + +## Integration + +This tool is designed to integrate with: +- Existing ANTLR v4 grammar parser at `tools/grammar/` +- All parser implementations in the repository (postgresql, cql, redshift, etc.) +- Shared CI/CD pipeline and testing infrastructure + +**TODO**: Grammar parser integration and actual query generation logic. 
\ No newline at end of file diff --git a/tools/fuzzing/cmd/fuzzer/main.go b/tools/fuzzing/cmd/fuzzer/main.go new file mode 100644 index 0000000..dd62d7e --- /dev/null +++ b/tools/fuzzing/cmd/fuzzer/main.go @@ -0,0 +1,108 @@ +package main + +import ( + "flag" + "fmt" + "os" + "strings" + "time" + + "github.com/bytebase/parser/tools/fuzzing/internal/config" + "github.com/bytebase/parser/tools/fuzzing/internal/generator" + "github.com/bytebase/parser/tools/fuzzing/internal/grammar" +) + +func main() { + cfg := parseFlags() + + if err := cfg.Validate(); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } + + cfg.Print() + + gen := generator.New(cfg) + if err := gen.Generate(); err != nil { + fmt.Fprintf(os.Stderr, "Generation failed: %v\n", err) + os.Exit(1) + } + + fmt.Println("Generation completed successfully!") +} + +func parseFlags() *config.Config { + cfg := &config.Config{} + var listGrammars bool + var grammarArg string + + flag.StringVar(&grammarArg, "grammar", "", "Grammar file(s): single file or comma-separated lexer,parser files") + flag.StringVar(&cfg.StartRule, "start-rule", "", "Starting grammar rule name") + flag.IntVar(&cfg.Count, "count", 10, "Number of queries to generate") + flag.IntVar(&cfg.MaxDepth, "max-depth", 5, "Maximum recursion depth") + flag.Float64Var(&cfg.OptionalProb, "optional-prob", 0.5, "Probability of including optional elements (0.0-1.0)") + flag.IntVar(&cfg.MaxQuantifier, "max-quantifier", 5, "Maximum count for quantified rules (* and +)") + flag.IntVar(&cfg.MinQuantifier, "min-quantifier", 0, "Minimum count for quantified rules (overrides grammar)") + flag.IntVar(&cfg.QuantifierCount, "quantifier-count", 0, "Fixed count for all quantifiers (overrides min/max)") + flag.StringVar(&cfg.Output, "output", "", "Output file path (default: stdout)") + flag.Int64Var(&cfg.Seed, "seed", time.Now().UnixNano(), "Random seed for reproducible generation") + flag.BoolVar(&listGrammars, "list-grammars", false, "List all 
available grammars and exit") + + // Custom usage message + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Grammar-Aware Fuzzing Tool\n\n") + fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, "Options:\n") + flag.PrintDefaults() + + fmt.Fprintf(os.Stderr, "\nExamples:\n") + fmt.Fprintf(os.Stderr, " # List available grammars\n") + fmt.Fprintf(os.Stderr, " %s --list-grammars\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " # Single combined grammar file\n") + fmt.Fprintf(os.Stderr, " %s --grammar combined.g4 --start-rule selectStmt --count 10\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " # Separate lexer and parser files\n") + fmt.Fprintf(os.Stderr, " %s --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 10\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " # Control recursion and quantifiers\n") + fmt.Fprintf(os.Stderr, " %s --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 3 --max-quantifier 8 --count 5\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " # Performance testing\n") + fmt.Fprintf(os.Stderr, " %s --grammar redshift/RedshiftLexer.g4,redshift/RedshiftParser.g4 --start-rule blockStmt --quantifier-count 100 --count 10\n\n", os.Args[0]) + fmt.Fprintf(os.Stderr, " # Output to file\n") + fmt.Fprintf(os.Stderr, " %s --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 100 --output queries.sql\n\n", os.Args[0]) + } + + flag.Parse() + + // Handle --list-grammars + if listGrammars { + grammars, err := grammar.ListAvailableGrammars() + if err != nil { + fmt.Fprintf(os.Stderr, "Error listing grammars: %v\n", err) + os.Exit(1) + } + + fmt.Println("Available grammars:") + for _, g := range grammars { + files, err := grammar.DiscoverGrammarFiles(g) + if err != nil { + fmt.Printf(" %s (error: %v)\n", g, err) + continue + } + fmt.Printf(" %s\n", g) + fmt.Printf(" Lexer: %s\n", files.LexerFile) + fmt.Printf(" Parser: %s\n", 
// Config holds all configuration options for the fuzzer.
type Config struct {
	GrammarFiles    []string // One combined grammar file, or two files in lexer,parser order.
	StartRule       string   // Name of the grammar rule generation starts from.
	Count           int      // Number of queries to generate.
	MaxDepth        int      // Maximum recursion depth.
	OptionalProb    float64  // Probability of including optional elements (0.0-1.0).
	MaxQuantifier   int      // Maximum count for quantified rules (* and +).
	MinQuantifier   int      // Minimum count for quantified rules.
	QuantifierCount int      // Fixed count for all quantifiers; Print only reports it when > 0.
	Output          string   // Output file path; empty means none was given.
	Seed            int64    // Random seed for reproducible generation.
}

// Validate checks that the configuration is usable and returns the first
// problem found, or nil when every option is acceptable.
//
// Checks run in a fixed order: grammar file count, grammar file paths, start
// rule, count, depth, optional probability, quantifier bounds, fixed
// quantifier count.
func (c *Config) Validate() error {
	switch n := len(c.GrammarFiles); {
	case n == 0:
		return errors.New("--grammar is required")
	case n > 2:
		return errors.New("--grammar accepts maximum 2 files (lexer,parser)")
	}

	// A stray comma on the command line ("lexer.g4,") yields an empty entry
	// that would otherwise only surface later as a confusing parse failure.
	for _, path := range c.GrammarFiles {
		if path == "" {
			return errors.New("--grammar contains an empty file path")
		}
	}

	if c.StartRule == "" {
		return errors.New("--start-rule is required")
	}

	if c.Count <= 0 {
		return errors.New("--count must be positive")
	}

	if c.MaxDepth <= 0 {
		return errors.New("--max-depth must be positive")
	}

	// Written as a negated "in range" test so that NaN, for which every
	// ordered comparison is false, is rejected as well.
	if !(c.OptionalProb >= 0.0 && c.OptionalProb <= 1.0) {
		return errors.New("--optional-prob must be between 0.0 and 1.0")
	}

	if c.MaxQuantifier <= 0 {
		return errors.New("--max-quantifier must be positive")
	}

	if c.MinQuantifier < 0 {
		return errors.New("--min-quantifier must be non-negative")
	}

	if c.MinQuantifier > c.MaxQuantifier {
		return errors.New("--min-quantifier cannot be greater than --max-quantifier")
	}

	if c.QuantifierCount < 0 {
		return errors.New("--quantifier-count must be non-negative")
	}

	return nil
}

// Print writes a human-readable summary of the configuration to stdout.
// Options left at their "unset" value (MinQuantifier and QuantifierCount of
// 0, empty Output) are omitted from the summary.
func (c *Config) Print() {
	fmt.Printf("Grammar-Aware Fuzzer\n")

	switch len(c.GrammarFiles) {
	case 1:
		fmt.Printf("Grammar File: %s\n", c.GrammarFiles[0])
	case 2:
		fmt.Printf("Lexer File: %s\n", c.GrammarFiles[0])
		fmt.Printf("Parser File: %s\n", c.GrammarFiles[1])
	}

	fmt.Printf("Start Rule: %s\n", c.StartRule)
	fmt.Printf("Count: %d\n", c.Count)
	fmt.Printf("Max Depth: %d\n", c.MaxDepth)
	fmt.Printf("Optional Probability: %.2f\n", c.OptionalProb)
	fmt.Printf("Max Quantifier: %d\n", c.MaxQuantifier)
	if c.MinQuantifier > 0 {
		fmt.Printf("Min Quantifier: %d\n", c.MinQuantifier)
	}
	if c.QuantifierCount > 0 {
		fmt.Printf("Fixed Quantifier Count: %d\n", c.QuantifierCount)
	}
	if c.Output != "" {
		fmt.Printf("Output: %s\n", c.Output)
	}
	fmt.Printf("Seed: %d\n", c.Seed)
	fmt.Println()
}
filePath := range g.config.GrammarFiles { + parsedGrammar, err := grammar.ParseGrammarFile(filePath) + if err != nil { + return errors.Wrapf(err, "failed to parse grammar file %s", filePath) + } + g.grammars[i] = parsedGrammar + fmt.Printf("Parsed grammar file: %s\n", filePath) + } + + // Validate start rule exists + if !g.hasRule(g.config.StartRule) { + return errors.Errorf("start rule '%s' not found in any grammar file", g.config.StartRule) + } + + fmt.Printf("Generating %d queries from rule '%s'...\n", g.config.Count, g.config.StartRule) + + // Generate queries + for i := 0; i < g.config.Count; i++ { + query := g.generateQuery(i + 1) + fmt.Printf("Query %d: %s\n", i+1, query) + } + + return nil +} + +// hasRule checks if a rule exists in any of the parsed grammars +func (g *Generator) hasRule(ruleName string) bool { + for _, grammar := range g.grammars { + if grammar.GetRule(ruleName) != nil { + return true + } + } + return false +} + +// getRule gets a rule from any of the parsed grammars +func (g *Generator) getRule(ruleName string) *grammar.Rule { + for _, grammar := range g.grammars { + if rule := grammar.GetRule(ruleName); rule != nil { + return rule + } + } + return nil +} + +// generateQuery creates a single query using grammar rules +func (g *Generator) generateQuery(index int) string { + // Start generation from the specified start rule + result := g.generateFromRule(g.config.StartRule, 0) + return result +} + +// generateFromRule recursively generates text from a grammar rule +func (g *Generator) generateFromRule(ruleName string, currentDepth int) string { + // Check depth limit to prevent infinite recursion + if currentDepth >= g.config.MaxDepth { + return g.generateTerminal(ruleName) + } + + // Get the rule + rule := g.getRule(ruleName) + if rule == nil { + // If rule not found, return placeholder + return fmt.Sprintf("<%s>", ruleName) + } + + // Select a random alternative + if len(rule.Alternatives) == 0 { + return fmt.Sprintf("<%s>", ruleName) + } 
+ + altIndex := g.random.Intn(len(rule.Alternatives)) + alternative := rule.Alternatives[altIndex] + + // Generate from all elements in the alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + } + + return fmt.Sprintf("/* %s */ %s", ruleName, joinWithSpaces(result)) +} + +// generateFromElement generates text from a single grammar element +func (g *Generator) generateFromElement(element *grammar.Element, currentDepth int) string { + // Handle optional elements + if element.IsOptional() && g.random.Float64() > g.config.OptionalProb { + return "" // Skip optional element + } + + // Handle quantified elements + if element.IsQuantified() { + return g.generateQuantified(element, currentDepth) + } + + // Generate single element + if element.IsRule() { + return g.generateFromRule(element.Value, currentDepth+1) + } else if element.IsTerminal() { + return cleanLiteral(element.Value) + } + + return element.Value +} + +// generateQuantified handles quantified elements (* + {n,m}) +func (g *Generator) generateQuantified(element *grammar.Element, currentDepth int) string { + var count int + + switch element.Quantifier { + case grammar.ZERO_MORE: // * + count = g.random.Intn(g.config.MaxQuantifier + 1) // 0 to MaxQuantifier + case grammar.ONE_MORE: // + + count = 1 + g.random.Intn(g.config.MaxQuantifier) // 1 to MaxQuantifier + case grammar.RANGE: // {n,m} + if g.config.QuantifierCount > 0 { + count = g.config.QuantifierCount + } else { + rangeSize := element.Max - element.Min + 1 + count = element.Min + g.random.Intn(rangeSize) + } + default: + count = 1 + } + + var results []string + for i := 0; i < count; i++ { + if element.IsRule() { + result := g.generateFromRule(element.Value, currentDepth+1) + results = append(results, result) + } else if element.IsTerminal() { + results = append(results, 
cleanLiteral(element.Value)) + } + } + + return joinWithSpaces(results) +} + +// generateTerminal generates a terminal when depth limit is reached +func (g *Generator) generateTerminal(ruleName string) string { + // For depth-limited cases, return a simple placeholder + return fmt.Sprintf("<%s_TERM>", ruleName) +} + +// cleanLiteral removes quotes from literal strings +func cleanLiteral(literal string) string { + // Remove single quotes from literals like 'SELECT' + if len(literal) >= 2 && literal[0] == '\'' && literal[len(literal)-1] == '\'' { + return literal[1 : len(literal)-1] + } + return literal +} + +// joinWithSpaces joins strings with spaces, skipping empty strings +func joinWithSpaces(strs []string) string { + var nonEmpty []string + for _, s := range strs { + if s != "" { + nonEmpty = append(nonEmpty, s) + } + } + if len(nonEmpty) == 0 { + return "" + } + return joinStrings(nonEmpty, " ") +} + +// joinStrings joins strings with a separator +func joinStrings(strs []string, sep string) string { + if len(strs) == 0 { + return "" + } + if len(strs) == 1 { + return strs[0] + } + + result := strs[0] + for i := 1; i < len(strs); i++ { + result += sep + strs[i] + } + return result +} \ No newline at end of file diff --git a/tools/fuzzing/internal/grammar/discovery.go b/tools/fuzzing/internal/grammar/discovery.go new file mode 100644 index 0000000..9aa2f51 --- /dev/null +++ b/tools/fuzzing/internal/grammar/discovery.go @@ -0,0 +1,187 @@ +package grammar + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/pkg/errors" +) + +// GrammarFiles represents a pair of lexer and parser grammar files +type GrammarFiles struct { + LexerFile string + ParserFile string + Directory string +} + +// DiscoverGrammarFiles finds lexer and parser files for a given grammar name +func DiscoverGrammarFiles(grammarName string) (*GrammarFiles, error) { + // Start from fuzzing directory, go up to parser root + currentDir, err := os.Getwd() + if err != nil { + return 
nil, errors.Wrap(err, "failed to get current directory") + } + + // Navigate to parser root (assuming we're in tools/fuzzing) + repoRoot := filepath.Join(currentDir, "..", "..") + + // Try different grammar directory patterns + grammarDirs := []string{ + filepath.Join(repoRoot, grammarName), // Direct: postgresql/, cql/ + filepath.Join(repoRoot, "tools", "grammar"), // ANTLR v4 self-grammar + filepath.Join(repoRoot, "grammars", grammarName), // Alternative structure + } + + for _, dir := range grammarDirs { + if files, err := findGrammarFilesInDir(dir, grammarName); err == nil { + return files, nil + } + } + + return nil, errors.Errorf("grammar '%s' not found in any of the expected locations", grammarName) +} + +// findGrammarFilesInDir searches for grammar files in a specific directory +func findGrammarFilesInDir(dir, grammarName string) (*GrammarFiles, error) { + if _, err := os.Stat(dir); os.IsNotExist(err) { + return nil, errors.Errorf("directory does not exist: %s", dir) + } + + entries, err := os.ReadDir(dir) + if err != nil { + return nil, errors.Wrapf(err, "failed to read directory %s", dir) + } + + var lexerFile, parserFile string + + // Look for grammar files using different naming patterns + patterns := []struct { + lexerPattern string + parserPattern string + }{ + // Standard patterns: PostgreSQLLexer.g4, PostgreSQLParser.g4 + {fmt.Sprintf("%sLexer.g4", capitalize(grammarName)), fmt.Sprintf("%sParser.g4", capitalize(grammarName))}, + // Special case for PostgreSQL: postgresql -> PostgreSQL + {fmt.Sprintf("%sLexer.g4", strings.ToUpper(grammarName)), fmt.Sprintf("%sParser.g4", strings.ToUpper(grammarName))}, + // Alternate patterns: CqlLexer.g4, CqlParser.g4 + {fmt.Sprintf("%sLexer.g4", strings.Title(grammarName)), fmt.Sprintf("%sParser.g4", strings.Title(grammarName))}, + // Lowercase patterns: postgresql_lexer.g4, postgresql_parser.g4 + {fmt.Sprintf("%s_lexer.g4", strings.ToLower(grammarName)), fmt.Sprintf("%s_parser.g4", strings.ToLower(grammarName))}, 
+ } + + // Special cases for known grammar naming conventions + switch strings.ToLower(grammarName) { + case "postgresql": + patterns = append(patterns, struct { + lexerPattern string + parserPattern string + }{"PostgreSQLLexer.g4", "PostgreSQLParser.g4"}) + case "antlrv4": + patterns = append(patterns, struct { + lexerPattern string + parserPattern string + }{"ANTLRv4Lexer.g4", "ANTLRv4Parser.g4"}) + } + + // Special case for ANTLR v4 self-grammar directory + if strings.Contains(dir, "tools/grammar") { + patterns = append(patterns, struct { + lexerPattern string + parserPattern string + }{"ANTLRv4Lexer.g4", "ANTLRv4Parser.g4"}) + } + + for _, entry := range entries { + if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".g4") { + for _, pattern := range patterns { + if entry.Name() == pattern.lexerPattern { + lexerFile = filepath.Join(dir, entry.Name()) + } + if entry.Name() == pattern.parserPattern { + parserFile = filepath.Join(dir, entry.Name()) + } + } + } + } + + // Check if we found both files + if lexerFile == "" { + return nil, errors.Errorf("lexer file not found in %s", dir) + } + if parserFile == "" { + return nil, errors.Errorf("parser file not found in %s", dir) + } + + return &GrammarFiles{ + LexerFile: lexerFile, + ParserFile: parserFile, + Directory: dir, + }, nil +} + +// ListAvailableGrammars scans for all available grammar directories +func ListAvailableGrammars() ([]string, error) { + currentDir, err := os.Getwd() + if err != nil { + return nil, errors.Wrap(err, "failed to get current directory") + } + + repoRoot := filepath.Join(currentDir, "..", "..") + + var grammars []string + + // Scan for grammar directories + entries, err := os.ReadDir(repoRoot) + if err != nil { + return nil, errors.Wrap(err, "failed to read repository root") + } + + for _, entry := range entries { + if entry.IsDir() { + dirPath := filepath.Join(repoRoot, entry.Name()) + if hasGrammarFiles(dirPath) { + grammars = append(grammars, entry.Name()) + } + } + } + + // Add 
special case for ANTLR v4 self-grammar + if hasGrammarFiles(filepath.Join(repoRoot, "tools", "grammar")) { + grammars = append(grammars, "antlrv4") + } + + return grammars, nil +} + +// hasGrammarFiles checks if a directory contains .g4 files +func hasGrammarFiles(dir string) bool { + entries, err := os.ReadDir(dir) + if err != nil { + return false + } + + var hasLexer, hasParser bool + for _, entry := range entries { + if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".g4") { + name := strings.ToLower(entry.Name()) + if strings.Contains(name, "lexer") { + hasLexer = true + } + if strings.Contains(name, "parser") { + hasParser = true + } + } + } + + return hasLexer && hasParser +} + +// capitalize capitalizes the first letter of a string, preserving the rest +func capitalize(s string) string { + if len(s) == 0 { + return s + } + return strings.ToUpper(s[:1]) + s[1:] +} \ No newline at end of file diff --git a/tools/fuzzing/internal/grammar/parser.go b/tools/fuzzing/internal/grammar/parser.go new file mode 100644 index 0000000..34cc05b --- /dev/null +++ b/tools/fuzzing/internal/grammar/parser.go @@ -0,0 +1,259 @@ +package grammar + +import ( + "fmt" + "os" + + "github.com/antlr4-go/antlr/v4" + "github.com/pkg/errors" + grammar "github.com/bytebase/parser/tools/grammar" +) + +// ParsedGrammar represents a parsed grammar with extracted rules +type ParsedGrammar struct { + LexerRules map[string]*Rule + ParserRules map[string]*Rule + FilePath string +} + +// Rule represents a grammar rule with its alternatives +type Rule struct { + Name string + Alternatives []Alternative + IsLexer bool +} + +// Alternative represents one alternative of a rule +type Alternative struct { + Elements []Element +} + +// Element represents an element within an alternative +type Element struct { + Type ElementType + Value string + Quantifier Quantifier + Min, Max int // for {n,m} quantifiers +} + +// ElementType indicates the type of grammar element +type ElementType int + +const ( + 
RULE_REF ElementType = iota + TOKEN_REF + LITERAL + OPTIONAL + QUANTIFIED +) + +// Quantifier indicates repetition type +type Quantifier int + +const ( + NONE Quantifier = iota + OPTIONAL_Q // ? + ZERO_MORE // * + ONE_MORE // + + RANGE // {n,m} +) + +// ParseGrammarFile parses a .g4 file and extracts rules for fuzzing +func ParseGrammarFile(filePath string) (*ParsedGrammar, error) { + // Read file content + content, err := os.ReadFile(filePath) + if err != nil { + return nil, errors.Wrap(err, "failed to read grammar file") + } + + if len(content) == 0 { + return nil, errors.New("grammar file is empty") + } + + // Create input stream + input := antlr.NewInputStream(string(content)) + + // Create lexer + lexer := grammar.NewANTLRv4Lexer(input) + + // Add error listener + errorListener := &GrammarErrorListener{} + lexer.RemoveErrorListeners() + lexer.AddErrorListener(errorListener) + + // Create token stream + stream := antlr.NewCommonTokenStream(lexer, 0) + + // Create parser + parser := grammar.NewANTLRv4Parser(stream) + + // Add error listener to parser + parser.RemoveErrorListeners() + parser.AddErrorListener(errorListener) + + // Parse the grammar + tree := parser.GrammarSpec() + + // Check for parsing errors + if errorListener.HasErrors() { + return nil, errors.Errorf("failed to parse grammar: %v", errorListener.GetErrors()) + } + + if tree == nil { + return nil, errors.New("parser returned nil tree") + } + + // Extract rules from parse tree + visitor := &GrammarExtractorVisitor{ + lexerRules: make(map[string]*Rule), + parserRules: make(map[string]*Rule), + } + + visitor.Visit(tree) + + return &ParsedGrammar{ + LexerRules: visitor.lexerRules, + ParserRules: visitor.parserRules, + FilePath: filePath, + }, nil +} + +// GetRule gets a rule by name from either lexer or parser rules +func (g *ParsedGrammar) GetRule(name string) *Rule { + if rule, ok := g.ParserRules[name]; ok { + return rule + } + if rule, ok := g.LexerRules[name]; ok { + return rule + } + return nil 
+} + +// GetAllRules returns all rules (both lexer and parser) +func (g *ParsedGrammar) GetAllRules() map[string]*Rule { + allRules := make(map[string]*Rule) + for name, rule := range g.LexerRules { + allRules[name] = rule + } + for name, rule := range g.ParserRules { + allRules[name] = rule + } + return allRules +} + +// IsRule checks if an element refers to another rule +func (e *Element) IsRule() bool { + return e.Type == RULE_REF || e.Type == TOKEN_REF +} + +// IsTerminal checks if an element is a terminal (literal) +func (e *Element) IsTerminal() bool { + return e.Type == LITERAL +} + +// IsOptional checks if an element has optional quantifier +func (e *Element) IsOptional() bool { + return e.Quantifier == OPTIONAL_Q +} + +// IsQuantified checks if an element has repetition quantifiers +func (e *Element) IsQuantified() bool { + return e.Quantifier == ZERO_MORE || e.Quantifier == ONE_MORE || e.Quantifier == RANGE +} + +// GrammarErrorListener collects parsing errors +type GrammarErrorListener struct { + errors []string +} + +func (l *GrammarErrorListener) SyntaxError(recognizer antlr.Recognizer, offendingSymbol interface{}, line, column int, msg string, e antlr.RecognitionException) { + l.errors = append(l.errors, fmt.Sprintf("line %d:%d %s", line, column, msg)) +} + +func (l *GrammarErrorListener) ReportAmbiguity(recognizer antlr.Parser, dfa *antlr.DFA, startIndex, stopIndex int, exact bool, ambigAlts *antlr.BitSet, configs *antlr.ATNConfigSet) { + // Ignore ambiguity for fuzzing purposes +} + +func (l *GrammarErrorListener) ReportAttemptingFullContext(recognizer antlr.Parser, dfa *antlr.DFA, startIndex, stopIndex int, conflictingAlts *antlr.BitSet, configs *antlr.ATNConfigSet) { + // Ignore for fuzzing purposes +} + +func (l *GrammarErrorListener) ReportContextSensitivity(recognizer antlr.Parser, dfa *antlr.DFA, startIndex, stopIndex, prediction int, configs *antlr.ATNConfigSet) { + // Ignore for fuzzing purposes +} + +func (l *GrammarErrorListener) 
HasErrors() bool { + return len(l.errors) > 0 +} + +func (l *GrammarErrorListener) GetErrors() []string { + return l.errors +} + +// GrammarExtractorVisitor extracts rules from the parse tree +type GrammarExtractorVisitor struct { + antlr.ParseTreeVisitor + lexerRules map[string]*Rule + parserRules map[string]*Rule + isLexer bool +} + +func (v *GrammarExtractorVisitor) Visit(tree antlr.ParseTree) interface{} { + // TODO: Implement tree visiting to extract rules + // This is a placeholder - we'll implement the actual visitor logic + // to walk the parse tree and extract rule information + + // For now, let's create a simple placeholder structure + v.extractPlaceholderRules() + + return nil +} + +// extractPlaceholderRules creates placeholder rules for testing +func (v *GrammarExtractorVisitor) extractPlaceholderRules() { + // Add some basic rules for testing + v.parserRules["selectStmt"] = &Rule{ + Name: "selectStmt", + IsLexer: false, + Alternatives: []Alternative{ + { + Elements: []Element{ + {Type: LITERAL, Value: "SELECT"}, + {Type: RULE_REF, Value: "columnList"}, + {Type: LITERAL, Value: "FROM"}, + {Type: RULE_REF, Value: "tableRef"}, + {Type: RULE_REF, Value: "whereClause", Quantifier: OPTIONAL_Q}, + }, + }, + }, + } + + v.parserRules["columnList"] = &Rule{ + Name: "columnList", + IsLexer: false, + Alternatives: []Alternative{ + { + Elements: []Element{ + {Type: RULE_REF, Value: "column"}, + { + Type: RULE_REF, + Value: "column", + Quantifier: ZERO_MORE, + }, + }, + }, + }, + } + + v.lexerRules["SELECT"] = &Rule{ + Name: "SELECT", + IsLexer: true, + Alternatives: []Alternative{ + { + Elements: []Element{ + {Type: LITERAL, Value: "'SELECT'"}, + }, + }, + }, + } +} \ No newline at end of file From 32bae30217f4e57a7a953a07bde212b39ca61c20 Mon Sep 17 00:00:00 2001 From: h3n4l Date: Tue, 26 Aug 2025 14:45:17 +0800 Subject: [PATCH 2/9] fix: remove range --- tools/fuzzing/DESIGN.md | 22 ++-------------- tools/fuzzing/README.md | 2 +- 
tools/fuzzing/internal/generator/generator.go | 26 +++++++++---------- tools/fuzzing/internal/grammar/parser.go | 4 +-- 4 files changed, 16 insertions(+), 38 deletions(-) diff --git a/tools/fuzzing/DESIGN.md b/tools/fuzzing/DESIGN.md index e78484f..5d05ad3 100644 --- a/tools/fuzzing/DESIGN.md +++ b/tools/fuzzing/DESIGN.md @@ -166,15 +166,7 @@ selectList: column (',' column)* // Generate 1 to N columns identifier: LETTER (LETTER | DIGIT)+ // Generate 1 to N characters ``` -**Exact Count (`rule{n}`):** -```antlr -hexDigit: HEX_DIGIT{4} // Generate exactly 4 hex digits -``` - -**Range Count (`rule{min,max}`):** -```antlr -varchar: CHAR{1,255} // Generate 1 to 255 characters -``` +**Note**: ANTLR v4 does not support `{n}` or `{n,m}` quantifier syntax. These are regex-style quantifiers not supported in ANTLR grammar files. #### Quantifier Control Strategy @@ -218,17 +210,7 @@ func (g *Generator) generateQuantified(element *GrammarElement, config Quantifie max := min(config.MaxRepeat, 50) count = g.selectCount(min, max, config.Strategy) - case "{n}": // Exact count - if config.Strategy == "fixed" { - count = config.FixedCount - } else { - count = element.ExactCount - } - - case "{min,max}": // Range - min := max(element.MinCount, config.MinRepeat) - max := min(element.MaxCount, config.MaxRepeat) - count = g.selectCount(min, max, config.Strategy) + // Note: ANTLR v4 does not support {n} or {min,max} syntax } result := "" diff --git a/tools/fuzzing/README.md b/tools/fuzzing/README.md index 64f4409..f496ff1 100644 --- a/tools/fuzzing/README.md +++ b/tools/fuzzing/README.md @@ -44,7 +44,7 @@ tools/fuzzing/ | `--count` | Number of queries to generate | 10 | | `--max-depth` | Maximum recursion depth | 5 | | `--optional-prob` | Probability of optional elements (0.0-1.0) | 0.5 | -| `--max-quantifier` | Maximum count for `*` and `+` rules | 5 | +| `--max-quantifier` | Maximum count for `*` and `+` quantifiers | 5 | | `--min-quantifier` | Minimum count override | 0 | | 
`--quantifier-count` | Fixed count for all quantifiers | 0 | | `--output` | Output file path | stdout | diff --git a/tools/fuzzing/internal/generator/generator.go b/tools/fuzzing/internal/generator/generator.go index 648cd77..d8eed47 100644 --- a/tools/fuzzing/internal/generator/generator.go +++ b/tools/fuzzing/internal/generator/generator.go @@ -138,24 +138,22 @@ func (g *Generator) generateFromElement(element *grammar.Element, currentDepth i return element.Value } -// generateQuantified handles quantified elements (* + {n,m}) +// generateQuantified handles quantified elements (* +) func (g *Generator) generateQuantified(element *grammar.Element, currentDepth int) string { var count int - switch element.Quantifier { - case grammar.ZERO_MORE: // * - count = g.random.Intn(g.config.MaxQuantifier + 1) // 0 to MaxQuantifier - case grammar.ONE_MORE: // + - count = 1 + g.random.Intn(g.config.MaxQuantifier) // 1 to MaxQuantifier - case grammar.RANGE: // {n,m} - if g.config.QuantifierCount > 0 { - count = g.config.QuantifierCount - } else { - rangeSize := element.Max - element.Min + 1 - count = element.Min + g.random.Intn(rangeSize) + // Use fixed count if specified, otherwise use random count + if g.config.QuantifierCount > 0 { + count = g.config.QuantifierCount + } else { + switch element.Quantifier { + case grammar.ZERO_MORE: // * + count = g.random.Intn(g.config.MaxQuantifier + 1) // 0 to MaxQuantifier + case grammar.ONE_MORE: // + + count = 1 + g.random.Intn(g.config.MaxQuantifier) // 1 to MaxQuantifier + default: + count = 1 } - default: - count = 1 } var results []string diff --git a/tools/fuzzing/internal/grammar/parser.go b/tools/fuzzing/internal/grammar/parser.go index 34cc05b..124cea0 100644 --- a/tools/fuzzing/internal/grammar/parser.go +++ b/tools/fuzzing/internal/grammar/parser.go @@ -33,7 +33,6 @@ type Element struct { Type ElementType Value string Quantifier Quantifier - Min, Max int // for {n,m} quantifiers } // ElementType indicates the type of grammar 
element @@ -55,7 +54,6 @@ const ( OPTIONAL_Q // ? ZERO_MORE // * ONE_MORE // + - RANGE // {n,m} ) // ParseGrammarFile parses a .g4 file and extracts rules for fuzzing @@ -158,7 +156,7 @@ func (e *Element) IsOptional() bool { // IsQuantified checks if an element has repetition quantifiers func (e *Element) IsQuantified() bool { - return e.Quantifier == ZERO_MORE || e.Quantifier == ONE_MORE || e.Quantifier == RANGE + return e.Quantifier == ZERO_MORE || e.Quantifier == ONE_MORE } // GrammarErrorListener collects parsing errors From a3c23e6b49c4c395cdbf884d11aaf813faba50a1 Mon Sep 17 00:00:00 2001 From: h3n4l Date: Thu, 28 Aug 2025 10:29:54 +0800 Subject: [PATCH 3/9] feat: parse grammar IR --- tools/fuzzing/internal/generator/generator.go | 54 ++- tools/fuzzing/internal/grammar/parser.go | 443 +++++++++++++++--- tools/fuzzing/internal/grammar/parser_test.go | 219 +++++++++ 3 files changed, 635 insertions(+), 81 deletions(-) create mode 100644 tools/fuzzing/internal/grammar/parser_test.go diff --git a/tools/fuzzing/internal/generator/generator.go b/tools/fuzzing/internal/generator/generator.go index d8eed47..207c677 100644 --- a/tools/fuzzing/internal/generator/generator.go +++ b/tools/fuzzing/internal/generator/generator.go @@ -130,12 +130,20 @@ func (g *Generator) generateFromElement(element *grammar.Element, currentDepth i // Generate single element if element.IsRule() { - return g.generateFromRule(element.Value, currentDepth+1) + if refValue, ok := element.Value.(grammar.ReferenceValue); ok { + return g.generateFromRule(refValue.Name, currentDepth+1) + } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + return g.generateFromBlock(blockValue, currentDepth) + } + return g.generateFromRule(element.Value.String(), currentDepth+1) } else if element.IsTerminal() { - return cleanLiteral(element.Value) + if litValue, ok := element.Value.(grammar.LiteralValue); ok { + return cleanLiteral(litValue.Text) + } + return cleanLiteral(element.Value.String()) } - 
return element.Value + return element.Value.String() } // generateQuantified handles quantified elements (* +) @@ -159,16 +167,50 @@ func (g *Generator) generateQuantified(element *grammar.Element, currentDepth in var results []string for i := 0; i < count; i++ { if element.IsRule() { - result := g.generateFromRule(element.Value, currentDepth+1) - results = append(results, result) + if refValue, ok := element.Value.(grammar.ReferenceValue); ok { + result := g.generateFromRule(refValue.Name, currentDepth+1) + results = append(results, result) + } else { + result := g.generateFromRule(element.Value.String(), currentDepth+1) + results = append(results, result) + } } else if element.IsTerminal() { - results = append(results, cleanLiteral(element.Value)) + if litValue, ok := element.Value.(grammar.LiteralValue); ok { + results = append(results, cleanLiteral(litValue.Text)) + } else { + results = append(results, cleanLiteral(element.Value.String())) + } + } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + result := g.generateFromBlock(blockValue, currentDepth+1) + results = append(results, result) } } return joinWithSpaces(results) } +// generateFromBlock generates content from a block value +func (g *Generator) generateFromBlock(blockValue grammar.BlockValue, currentDepth int) string { + if len(blockValue.Alternatives) == 0 { + return "" + } + + // Select a random alternative from the block + altIndex := g.random.Intn(len(blockValue.Alternatives)) + alternative := blockValue.Alternatives[altIndex] + + // Generate from all elements in the selected alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + } + + return joinWithSpaces(result) +} + // generateTerminal generates a terminal when depth limit is reached func (g *Generator) generateTerminal(ruleName string) string { // For 
depth-limited cases, return a simple placeholder diff --git a/tools/fuzzing/internal/grammar/parser.go b/tools/fuzzing/internal/grammar/parser.go index 124cea0..0e24435 100644 --- a/tools/fuzzing/internal/grammar/parser.go +++ b/tools/fuzzing/internal/grammar/parser.go @@ -3,6 +3,7 @@ package grammar import ( "fmt" "os" + "strings" "github.com/antlr4-go/antlr/v4" "github.com/pkg/errors" @@ -14,6 +15,9 @@ type ParsedGrammar struct { LexerRules map[string]*Rule ParserRules map[string]*Rule FilePath string + // BlockAltMap stores temporary block rules for debugging + // Key: block ID (e.g., "block_1_alts"), Value: the block alternatives + BlockAltMap map[string][]Alternative } // Rule represents a grammar rule with its alternatives @@ -28,24 +32,61 @@ type Alternative struct { Elements []Element } +// Global block ID counter for generating unique block names +var globalBlockID = 0 + +// ElementValue represents different types of element values +type ElementValue interface { + // String returns a string representation for display/debugging + String() string +} + +// LiteralValue represents a literal string (e.g., 'SELECT') +type LiteralValue struct { + Text string +} + +func (l LiteralValue) String() string { return l.Text } + +// ReferenceValue represents a reference to a rule or token (e.g., IDENTIFIER, selectStmt) +type ReferenceValue struct { + Name string +} + +func (r ReferenceValue) String() string { return r.Name } + +// BlockValue represents a generated block (e.g., (',' column)*) +type BlockValue struct { + ID string // Global unique ID like "block_1_alts" + Alternatives []Alternative +} + +func (b BlockValue) String() string { + if len(b.Alternatives) == 0 { + return "" + } + if len(b.Alternatives) == 1 { + elements := []string{} + for _, elem := range b.Alternatives[0].Elements { + elements = append(elements, elem.Value.String()) + } + return fmt.Sprintf("(%s)", strings.Join(elements, " ")) + } + return b.ID +} + + +// WildcardValue represents a wildcard 
(.) +type WildcardValue struct{} + +func (w WildcardValue) String() string { return "." } + // Element represents an element within an alternative type Element struct { - Type ElementType - Value string + Value ElementValue Quantifier Quantifier } -// ElementType indicates the type of grammar element -type ElementType int - -const ( - RULE_REF ElementType = iota - TOKEN_REF - LITERAL - OPTIONAL - QUANTIFIED -) - // Quantifier indicates repetition type type Quantifier int @@ -102,17 +143,16 @@ func ParseGrammarFile(filePath string) (*ParsedGrammar, error) { } // Extract rules from parse tree - visitor := &GrammarExtractorVisitor{ - lexerRules: make(map[string]*Rule), - parserRules: make(map[string]*Rule), - } + visitor := NewGrammarExtractorVisitor() + visitor.VisitGrammarSpec(tree) + - visitor.Visit(tree) return &ParsedGrammar{ LexerRules: visitor.lexerRules, ParserRules: visitor.parserRules, FilePath: filePath, + BlockAltMap: visitor.blockAltMap, }, nil } @@ -139,14 +179,30 @@ func (g *ParsedGrammar) GetAllRules() map[string]*Rule { return allRules } -// IsRule checks if an element refers to another rule +// GetBlockAlternatives returns the alternatives for a generated block ID +func (g *ParsedGrammar) GetBlockAlternatives(blockID string) ([]Alternative, bool) { + alts, exists := g.BlockAltMap[blockID] + return alts, exists +} + +// IsGeneratedBlock checks if a name refers to a generated block +func (g *ParsedGrammar) IsGeneratedBlock(name string) bool { + _, exists := g.BlockAltMap[name] + return exists +} + +// IsRule checks if an element refers to another rule or generated block func (e *Element) IsRule() bool { - return e.Type == RULE_REF || e.Type == TOKEN_REF + _, isRef := e.Value.(ReferenceValue) + _, isBlock := e.Value.(BlockValue) + return isRef || isBlock } // IsTerminal checks if an element is a terminal (literal) func (e *Element) IsTerminal() bool { - return e.Type == LITERAL + _, isLit := e.Value.(LiteralValue) + _, isWild := e.Value.(WildcardValue) 
+ return isLit || isWild } // IsOptional checks if an element has optional quantifier @@ -190,68 +246,305 @@ func (l *GrammarErrorListener) GetErrors() []string { // GrammarExtractorVisitor extracts rules from the parse tree type GrammarExtractorVisitor struct { - antlr.ParseTreeVisitor + *grammar.BaseANTLRv4ParserVisitor lexerRules map[string]*Rule parserRules map[string]*Rule - isLexer bool + blockAltMap map[string][]Alternative } -func (v *GrammarExtractorVisitor) Visit(tree antlr.ParseTree) interface{} { - // TODO: Implement tree visiting to extract rules - // This is a placeholder - we'll implement the actual visitor logic - // to walk the parse tree and extract rule information - - // For now, let's create a simple placeholder structure - v.extractPlaceholderRules() - +// NewGrammarExtractorVisitor creates a new visitor +func NewGrammarExtractorVisitor() *GrammarExtractorVisitor { + v := &GrammarExtractorVisitor{ + BaseANTLRv4ParserVisitor: &grammar.BaseANTLRv4ParserVisitor{}, + lexerRules: make(map[string]*Rule), + parserRules: make(map[string]*Rule), + blockAltMap: make(map[string][]Alternative), + } + return v +} + +// VisitGrammarSpec visits the grammar specification +func (v *GrammarExtractorVisitor) VisitGrammarSpec(ctx grammar.IGrammarSpecContext) interface{} { + // Visit rules section + if rulesCtx := ctx.Rules(); rulesCtx != nil { + v.VisitRules(rulesCtx) + } + return nil +} + +// VisitRules visits the rules section +func (v *GrammarExtractorVisitor) VisitRules(ctx grammar.IRulesContext) interface{} { + // Visit all rule specifications + for _, ruleSpecCtx := range ctx.AllRuleSpec() { + v.VisitRuleSpec(ruleSpecCtx) + } + return nil +} + +// VisitRuleSpec visits a rule specification (could be parser or lexer rule) +func (v *GrammarExtractorVisitor) VisitRuleSpec(ctx grammar.IRuleSpecContext) interface{} { + // Focus only on parser rules for now + if parserRuleCtx := ctx.ParserRuleSpec(); parserRuleCtx != nil { + v.VisitParserRuleSpec(parserRuleCtx) + 
} + // Skip lexer rules for now + return nil +} + +// VisitParserRuleSpec visits a parser rule specification +func (v *GrammarExtractorVisitor) VisitParserRuleSpec(ctx grammar.IParserRuleSpecContext) interface{} { + // Get rule name + ruleNameToken := ctx.RULE_REF() + if ruleNameToken == nil { + return nil + } + ruleName := ruleNameToken.GetText() + + // Get rule block (alternatives) + ruleBlockCtx := ctx.RuleBlock() + if ruleBlockCtx == nil { + return nil + } + + // Extract alternatives + alternatives := v.extractAlternatives(ruleBlockCtx) + + // Create rule + rule := &Rule{ + Name: ruleName, + IsLexer: false, + Alternatives: alternatives, + } + + // Store rule + v.parserRules[ruleName] = rule + + return nil +} + +// extractAlternatives extracts alternatives from a rule block +func (v *GrammarExtractorVisitor) extractAlternatives(ruleBlockCtx grammar.IRuleBlockContext) []Alternative { + var alternatives []Alternative + + // Get rule alternative list + ruleAltListCtx := ruleBlockCtx.RuleAltList() + if ruleAltListCtx == nil { + return alternatives + } + + // Process each labeled alternative + for _, labeledAltCtx := range ruleAltListCtx.AllLabeledAlt() { + alternative := v.extractAlternative(labeledAltCtx) + alternatives = append(alternatives, alternative) + } + + return alternatives +} + +// extractAlternative extracts a single alternative +func (v *GrammarExtractorVisitor) extractAlternative(labeledAltCtx grammar.ILabeledAltContext) Alternative { + var elements []Element + + // Get alternative context + altCtx := labeledAltCtx.Alternative() + if altCtx != nil { + // Process each element in the alternative + for _, elementCtx := range altCtx.AllElement() { + element := v.extractElement(elementCtx) + if element != nil { + elements = append(elements, *element) + } + } + } + + return Alternative{ + Elements: elements, + } +} + +// extractElement extracts an element from an element context +func (v *GrammarExtractorVisitor) extractElement(elementCtx 
grammar.IElementContext) *Element { + // Handle labeled elements + if labeledElementCtx := elementCtx.LabeledElement(); labeledElementCtx != nil { + return v.extractLabeledElement(labeledElementCtx) + } + + // Handle atoms (terminals/non-terminals) + if atomCtx := elementCtx.Atom(); atomCtx != nil { + element := v.extractAtom(atomCtx) + // Check for quantifiers + if element != nil { + element.Quantifier = v.extractQuantifier(elementCtx.EbnfSuffix()) + } + return element + } + + // Handle EBNF constructs (blocks with quantifiers) + if ebnfCtx := elementCtx.Ebnf(); ebnfCtx != nil { + return v.extractEbnf(ebnfCtx) + } + return nil } -// extractPlaceholderRules creates placeholder rules for testing -func (v *GrammarExtractorVisitor) extractPlaceholderRules() { - // Add some basic rules for testing - v.parserRules["selectStmt"] = &Rule{ - Name: "selectStmt", - IsLexer: false, - Alternatives: []Alternative{ - { - Elements: []Element{ - {Type: LITERAL, Value: "SELECT"}, - {Type: RULE_REF, Value: "columnList"}, - {Type: LITERAL, Value: "FROM"}, - {Type: RULE_REF, Value: "tableRef"}, - {Type: RULE_REF, Value: "whereClause", Quantifier: OPTIONAL_Q}, - }, - }, - }, +// extractLabeledElement extracts a labeled element (e.g., label=atom) +func (v *GrammarExtractorVisitor) extractLabeledElement(labeledElementCtx grammar.ILabeledElementContext) *Element { + // For now, just extract the atom part and ignore the label + if atomCtx := labeledElementCtx.Atom(); atomCtx != nil { + return v.extractAtom(atomCtx) } + if blockCtx := labeledElementCtx.Block(); blockCtx != nil { + return v.extractBlock(blockCtx) + } + return nil +} + +// extractAtom extracts an atom (terminal or non-terminal) +func (v *GrammarExtractorVisitor) extractAtom(atomCtx grammar.IAtomContext) *Element { + // Handle terminal definition (string literal or token reference) + if terminalDefCtx := atomCtx.TerminalDef(); terminalDefCtx != nil { + return v.extractTerminalDef(terminalDefCtx) + } + + // Handle rule 
reference + if rulerefCtx := atomCtx.Ruleref(); rulerefCtx != nil { + return v.extractRuleRef(rulerefCtx) + } + + // Handle wildcard (.) + if wildcardCtx := atomCtx.Wildcard(); wildcardCtx != nil { + return &Element{ + Value: WildcardValue{}, + } + } + + // Handle not sets, ranges, etc. - for now just return nil + return nil +} + +// extractTerminalDef extracts a terminal definition (literal string or token reference) +func (v *GrammarExtractorVisitor) extractTerminalDef(terminalDefCtx grammar.ITerminalDefContext) *Element { + if stringLiteralToken := terminalDefCtx.STRING_LITERAL(); stringLiteralToken != nil { + return &Element{ + Value: LiteralValue{Text: stringLiteralToken.GetText()}, + } + } + if tokenRefToken := terminalDefCtx.TOKEN_REF(); tokenRefToken != nil { + return &Element{ + Value: ReferenceValue{Name: tokenRefToken.GetText()}, + } + } + return nil +} + + +// extractRuleRef extracts a rule reference +func (v *GrammarExtractorVisitor) extractRuleRef(rulerefCtx grammar.IRulerefContext) *Element { + if ruleRefToken := rulerefCtx.RULE_REF(); ruleRefToken != nil { + return &Element{ + Value: ReferenceValue{Name: ruleRefToken.GetText()}, + } + } + return nil +} + +// extractBlock extracts a block (grouped alternatives) +func (v *GrammarExtractorVisitor) extractBlock(blockCtx grammar.IBlockContext) *Element { + // Get the alternative list from the block + altListCtx := blockCtx.AltList() + if altListCtx == nil { + globalBlockID++ + blockID := fmt.Sprintf("block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all alternatives from the block + alts := altListCtx.AllAlternative() + if len(alts) == 0 { + globalBlockID++ + blockID := fmt.Sprintf("block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: 
emptyAlts}, + } + } + + // Extract all alternatives + blockAlternatives := []Alternative{} + for _, altCtx := range alts { + elements := []Element{} + for _, elementCtx := range altCtx.AllElement() { + element := v.extractElement(elementCtx) + if element != nil { + elements = append(elements, *element) + } + } + blockAlternatives = append(blockAlternatives, Alternative{Elements: elements}) + } + + // If it's a single element in a single alternative, we can simplify + if len(blockAlternatives) == 1 && len(blockAlternatives[0].Elements) == 1 { + return &blockAlternatives[0].Elements[0] + } + + // Generate global unique block ID and store mapping + globalBlockID++ + blockID := fmt.Sprintf("block_%d_alts", globalBlockID) + v.blockAltMap[blockID] = blockAlternatives - v.parserRules["columnList"] = &Rule{ - Name: "columnList", - IsLexer: false, - Alternatives: []Alternative{ - { - Elements: []Element{ - {Type: RULE_REF, Value: "column"}, - { - Type: RULE_REF, - Value: "column", - Quantifier: ZERO_MORE, - }, - }, - }, - }, - } - - v.lexerRules["SELECT"] = &Rule{ - Name: "SELECT", - IsLexer: true, - Alternatives: []Alternative{ - { - Elements: []Element{ - {Type: LITERAL, Value: "'SELECT'"}, - }, - }, - }, + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: blockAlternatives}, + } +} + +// extractEbnf extracts EBNF constructs (blocks with suffixes) +func (v *GrammarExtractorVisitor) extractEbnf(ebnfCtx grammar.IEbnfContext) *Element { + // Get the block + blockCtx := ebnfCtx.Block() + if blockCtx == nil { + return nil + } + + element := v.extractBlock(blockCtx) + if element != nil { + // Apply quantifier from block suffix + if blockSuffixCtx := ebnfCtx.BlockSuffix(); blockSuffixCtx != nil { + if ebnfSuffixCtx := blockSuffixCtx.EbnfSuffix(); ebnfSuffixCtx != nil { + element.Quantifier = v.extractQuantifier(ebnfSuffixCtx) + } + } + } + + return element +} + +// extractQuantifier extracts quantifier from EBNF suffix +func (v *GrammarExtractorVisitor) 
extractQuantifier(ebnfSuffixCtx grammar.IEbnfSuffixContext) Quantifier { + if ebnfSuffixCtx == nil { + return NONE } + + // Check for question mark (optional) + if ebnfSuffixCtx.QUESTION(0) != nil { + return OPTIONAL_Q + } + + // Check for star (zero or more) + if ebnfSuffixCtx.STAR() != nil { + return ZERO_MORE + } + + // Check for plus (one or more) + if ebnfSuffixCtx.PLUS() != nil { + return ONE_MORE + } + + return NONE } \ No newline at end of file diff --git a/tools/fuzzing/internal/grammar/parser_test.go b/tools/fuzzing/internal/grammar/parser_test.go new file mode 100644 index 0000000..12ffa54 --- /dev/null +++ b/tools/fuzzing/internal/grammar/parser_test.go @@ -0,0 +1,219 @@ +package grammar + +import ( + "os" + "path/filepath" + "testing" +) + +// TestCompleteGrammarIR tests the complete intermediate representation of parsed grammar +func TestCompleteGrammarIR(t *testing.T) { + grammarContent := ` +parser grammar CompleteIRTest; + +// Simple rule with literals +greeting: 'Hello' 'World'; + +// Rule with alternatives +statement: selectStmt | insertStmt | 'DELETE'; + +// Rule with quantifiers and mixed elements +selectStmt: 'SELECT' columnList 'FROM' IDENTIFIER whereClause?; + +// Rule with quantified elements +columnList: column (',' column)*; + +// Rule with token reference +column: IDENTIFIER ('AS' IDENTIFIER)?; + +// Rule with optional and alternatives +whereClause: 'WHERE' expr; + +// Complex rule with multiple alternatives and quantifiers +expr: expr '+' expr + | expr '*' expr + | '(' expr ')' + | IDENTIFIER + | NUMBER; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + grammar, err := ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse grammar: %v", err) + } + + // Basic grammar properties + if grammar == nil { + t.Fatal("Grammar is nil") + } + if grammar.FilePath != tmpFile { + t.Errorf("Expected file path %s, got %s", tmpFile, grammar.FilePath) + } + if len(grammar.LexerRules) != 0 { + 
t.Errorf("Expected 0 lexer rules, got %d", len(grammar.LexerRules)) + } + if len(grammar.ParserRules) != 7 { + t.Errorf("Expected 7 parser rules, got %d", len(grammar.ParserRules)) + } + + // Test cases for rule validation + tests := []struct { + ruleName string + alternatives int + elements []elementTest + }{ + { + ruleName: "greeting", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'Hello'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "'World'", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "statement", + alternatives: 3, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "selectStmt", quantifier: NONE, elementType: "reference"}, + {altIndex: 1, elementIndex: 0, value: "insertStmt", quantifier: NONE, elementType: "reference"}, + {altIndex: 2, elementIndex: 0, value: "'DELETE'", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "selectStmt", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'SELECT'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "columnList", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 2, value: "'FROM'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 3, value: "IDENTIFIER", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 4, value: "whereClause", quantifier: OPTIONAL_Q, elementType: "reference"}, + }, + }, + { + ruleName: "columnList", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "column", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 1, value: "(',' column)", quantifier: ZERO_MORE, elementType: "block"}, + }, + }, + { + ruleName: "column", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "IDENTIFIER", quantifier: NONE, elementType: "reference"}, + 
{altIndex: 0, elementIndex: 1, value: "('AS' IDENTIFIER)", quantifier: OPTIONAL_Q, elementType: "block"}, + }, + }, + { + ruleName: "whereClause", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'WHERE'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "expr", quantifier: NONE, elementType: "reference"}, + }, + }, + { + ruleName: "expr", + alternatives: 5, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "expr", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 1, value: "'+'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 2, value: "expr", quantifier: NONE, elementType: "reference"}, + {altIndex: 1, elementIndex: 1, value: "'*'", quantifier: NONE, elementType: "literal"}, + {altIndex: 2, elementIndex: 0, value: "'('", quantifier: NONE, elementType: "literal"}, + {altIndex: 2, elementIndex: 1, value: "expr", quantifier: NONE, elementType: "reference"}, + {altIndex: 2, elementIndex: 2, value: "')'", quantifier: NONE, elementType: "literal"}, + {altIndex: 3, elementIndex: 0, value: "IDENTIFIER", quantifier: NONE, elementType: "reference"}, + {altIndex: 4, elementIndex: 0, value: "NUMBER", quantifier: NONE, elementType: "reference"}, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.ruleName, func(t *testing.T) { + rule := grammar.GetRule(tc.ruleName) + if rule == nil { + t.Fatalf("rule %s not found", tc.ruleName) + } + if rule.Name != tc.ruleName || rule.IsLexer { + t.Errorf("rule %s has incorrect metadata", tc.ruleName) + } + if len(rule.Alternatives) != tc.alternatives { + t.Errorf("%s: expected %d alternatives, got %d", tc.ruleName, tc.alternatives, len(rule.Alternatives)) + } + + for _, elem := range tc.elements { + altIndex := elem.altIndex + elementIndex := elem.elementIndex + + if altIndex >= len(rule.Alternatives) { + t.Errorf("%s: alternative %d out of range", tc.ruleName, altIndex) + continue + } + + elements 
:= rule.Alternatives[altIndex].Elements + if elementIndex >= len(elements) { + t.Errorf("%s alt %d: element %d out of range", tc.ruleName, altIndex, elementIndex) + continue + } + + element := elements[elementIndex] + if elem.value != "" && element.Value.String() != elem.value { + t.Errorf("%s alt %d elem %d: expected value %s, got %s", tc.ruleName, altIndex, elementIndex, elem.value, element.Value.String()) + } + if element.Quantifier != elem.quantifier { + t.Errorf("%s alt %d elem %d: expected quantifier %v, got %v", tc.ruleName, altIndex, elementIndex, elem.quantifier, element.Quantifier) + } + + // Validate element type using type assertions + switch elem.elementType { + case "literal": + if _, ok := element.Value.(LiteralValue); !ok { + t.Errorf("%s alt %d elem %d: expected LiteralValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "reference": + if _, ok := element.Value.(ReferenceValue); !ok { + t.Errorf("%s alt %d elem %d: expected ReferenceValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "block": + if _, ok := element.Value.(BlockValue); !ok { + t.Errorf("%s alt %d elem %d: expected BlockValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + } + } + }) + } + + // Test GetAllRules method + allRules := grammar.GetAllRules() + if len(allRules) != 7 { + t.Errorf("GetAllRules: expected 7 rules, got %d", len(allRules)) + } +} + +type elementTest struct { + altIndex int + elementIndex int + value string + quantifier Quantifier + elementType string // "literal", "reference", or "block" +} + +// Helper functions + +func createTempGrammarFile(t *testing.T, content string) string { + tmpDir := os.TempDir() + tmpFile := filepath.Join(tmpDir, "test_grammar.g4") + + err := os.WriteFile(tmpFile, []byte(content), 0644) + if err != nil { + t.Fatalf("Failed to create temp grammar file: %v", err) + } + + return tmpFile +} \ No newline at end of file From 56efd1424bf7f578e5ffffb0579be657c427da22 Mon 
Sep 17 00:00:00 2001 From: h3n4l Date: Thu, 28 Aug 2025 10:51:51 +0800 Subject: [PATCH 4/9] feat: lexer parser v1 --- tools/fuzzing/internal/grammar/discovery.go | 187 --------------- tools/fuzzing/internal/grammar/parser.go | 220 +++++++++++++++++- tools/fuzzing/internal/grammar/parser_test.go | 191 +++++++++++++++ 3 files changed, 409 insertions(+), 189 deletions(-) delete mode 100644 tools/fuzzing/internal/grammar/discovery.go diff --git a/tools/fuzzing/internal/grammar/discovery.go b/tools/fuzzing/internal/grammar/discovery.go deleted file mode 100644 index 9aa2f51..0000000 --- a/tools/fuzzing/internal/grammar/discovery.go +++ /dev/null @@ -1,187 +0,0 @@ -package grammar - -import ( - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/pkg/errors" -) - -// GrammarFiles represents a pair of lexer and parser grammar files -type GrammarFiles struct { - LexerFile string - ParserFile string - Directory string -} - -// DiscoverGrammarFiles finds lexer and parser files for a given grammar name -func DiscoverGrammarFiles(grammarName string) (*GrammarFiles, error) { - // Start from fuzzing directory, go up to parser root - currentDir, err := os.Getwd() - if err != nil { - return nil, errors.Wrap(err, "failed to get current directory") - } - - // Navigate to parser root (assuming we're in tools/fuzzing) - repoRoot := filepath.Join(currentDir, "..", "..") - - // Try different grammar directory patterns - grammarDirs := []string{ - filepath.Join(repoRoot, grammarName), // Direct: postgresql/, cql/ - filepath.Join(repoRoot, "tools", "grammar"), // ANTLR v4 self-grammar - filepath.Join(repoRoot, "grammars", grammarName), // Alternative structure - } - - for _, dir := range grammarDirs { - if files, err := findGrammarFilesInDir(dir, grammarName); err == nil { - return files, nil - } - } - - return nil, errors.Errorf("grammar '%s' not found in any of the expected locations", grammarName) -} - -// findGrammarFilesInDir searches for grammar files in a specific directory 
-func findGrammarFilesInDir(dir, grammarName string) (*GrammarFiles, error) { - if _, err := os.Stat(dir); os.IsNotExist(err) { - return nil, errors.Errorf("directory does not exist: %s", dir) - } - - entries, err := os.ReadDir(dir) - if err != nil { - return nil, errors.Wrapf(err, "failed to read directory %s", dir) - } - - var lexerFile, parserFile string - - // Look for grammar files using different naming patterns - patterns := []struct { - lexerPattern string - parserPattern string - }{ - // Standard patterns: PostgreSQLLexer.g4, PostgreSQLParser.g4 - {fmt.Sprintf("%sLexer.g4", capitalize(grammarName)), fmt.Sprintf("%sParser.g4", capitalize(grammarName))}, - // Special case for PostgreSQL: postgresql -> PostgreSQL - {fmt.Sprintf("%sLexer.g4", strings.ToUpper(grammarName)), fmt.Sprintf("%sParser.g4", strings.ToUpper(grammarName))}, - // Alternate patterns: CqlLexer.g4, CqlParser.g4 - {fmt.Sprintf("%sLexer.g4", strings.Title(grammarName)), fmt.Sprintf("%sParser.g4", strings.Title(grammarName))}, - // Lowercase patterns: postgresql_lexer.g4, postgresql_parser.g4 - {fmt.Sprintf("%s_lexer.g4", strings.ToLower(grammarName)), fmt.Sprintf("%s_parser.g4", strings.ToLower(grammarName))}, - } - - // Special cases for known grammar naming conventions - switch strings.ToLower(grammarName) { - case "postgresql": - patterns = append(patterns, struct { - lexerPattern string - parserPattern string - }{"PostgreSQLLexer.g4", "PostgreSQLParser.g4"}) - case "antlrv4": - patterns = append(patterns, struct { - lexerPattern string - parserPattern string - }{"ANTLRv4Lexer.g4", "ANTLRv4Parser.g4"}) - } - - // Special case for ANTLR v4 self-grammar directory - if strings.Contains(dir, "tools/grammar") { - patterns = append(patterns, struct { - lexerPattern string - parserPattern string - }{"ANTLRv4Lexer.g4", "ANTLRv4Parser.g4"}) - } - - for _, entry := range entries { - if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".g4") { - for _, pattern := range patterns { - if entry.Name() 
== pattern.lexerPattern { - lexerFile = filepath.Join(dir, entry.Name()) - } - if entry.Name() == pattern.parserPattern { - parserFile = filepath.Join(dir, entry.Name()) - } - } - } - } - - // Check if we found both files - if lexerFile == "" { - return nil, errors.Errorf("lexer file not found in %s", dir) - } - if parserFile == "" { - return nil, errors.Errorf("parser file not found in %s", dir) - } - - return &GrammarFiles{ - LexerFile: lexerFile, - ParserFile: parserFile, - Directory: dir, - }, nil -} - -// ListAvailableGrammars scans for all available grammar directories -func ListAvailableGrammars() ([]string, error) { - currentDir, err := os.Getwd() - if err != nil { - return nil, errors.Wrap(err, "failed to get current directory") - } - - repoRoot := filepath.Join(currentDir, "..", "..") - - var grammars []string - - // Scan for grammar directories - entries, err := os.ReadDir(repoRoot) - if err != nil { - return nil, errors.Wrap(err, "failed to read repository root") - } - - for _, entry := range entries { - if entry.IsDir() { - dirPath := filepath.Join(repoRoot, entry.Name()) - if hasGrammarFiles(dirPath) { - grammars = append(grammars, entry.Name()) - } - } - } - - // Add special case for ANTLR v4 self-grammar - if hasGrammarFiles(filepath.Join(repoRoot, "tools", "grammar")) { - grammars = append(grammars, "antlrv4") - } - - return grammars, nil -} - -// hasGrammarFiles checks if a directory contains .g4 files -func hasGrammarFiles(dir string) bool { - entries, err := os.ReadDir(dir) - if err != nil { - return false - } - - var hasLexer, hasParser bool - for _, entry := range entries { - if !entry.IsDir() && strings.HasSuffix(entry.Name(), ".g4") { - name := strings.ToLower(entry.Name()) - if strings.Contains(name, "lexer") { - hasLexer = true - } - if strings.Contains(name, "parser") { - hasParser = true - } - } - } - - return hasLexer && hasParser -} - -// capitalize capitalizes the first letter of a string, preserving the rest -func capitalize(s 
string) string { - if len(s) == 0 { - return s - } - return strings.ToUpper(s[:1]) + s[1:] -} \ No newline at end of file diff --git a/tools/fuzzing/internal/grammar/parser.go b/tools/fuzzing/internal/grammar/parser.go index 0e24435..70ceec2 100644 --- a/tools/fuzzing/internal/grammar/parser.go +++ b/tools/fuzzing/internal/grammar/parser.go @@ -283,11 +283,14 @@ func (v *GrammarExtractorVisitor) VisitRules(ctx grammar.IRulesContext) interfac // VisitRuleSpec visits a rule specification (could be parser or lexer rule) func (v *GrammarExtractorVisitor) VisitRuleSpec(ctx grammar.IRuleSpecContext) interface{} { - // Focus only on parser rules for now + // Handle parser rules if parserRuleCtx := ctx.ParserRuleSpec(); parserRuleCtx != nil { v.VisitParserRuleSpec(parserRuleCtx) } - // Skip lexer rules for now + // Handle lexer rules + if lexerRuleCtx := ctx.LexerRuleSpec(); lexerRuleCtx != nil { + v.VisitLexerRuleSpec(lexerRuleCtx) + } return nil } @@ -322,6 +325,37 @@ func (v *GrammarExtractorVisitor) VisitParserRuleSpec(ctx grammar.IParserRuleSpe return nil } +// VisitLexerRuleSpec visits a lexer rule specification +func (v *GrammarExtractorVisitor) VisitLexerRuleSpec(ctx grammar.ILexerRuleSpecContext) interface{} { + // Get rule name + ruleNameToken := ctx.TOKEN_REF() + if ruleNameToken == nil { + return nil + } + ruleName := ruleNameToken.GetText() + + // Get lexer rule block (alternatives) + lexerRuleBlockCtx := ctx.LexerRuleBlock() + if lexerRuleBlockCtx == nil { + return nil + } + + // Extract alternatives from lexer rule block + alternatives := v.extractLexerAlternatives(lexerRuleBlockCtx) + + // Create rule + rule := &Rule{ + Name: ruleName, + IsLexer: true, + Alternatives: alternatives, + } + + // Store rule + v.lexerRules[ruleName] = rule + + return nil +} + // extractAlternatives extracts alternatives from a rule block func (v *GrammarExtractorVisitor) extractAlternatives(ruleBlockCtx grammar.IRuleBlockContext) []Alternative { var alternatives []Alternative @@ 
-341,6 +375,46 @@ func (v *GrammarExtractorVisitor) extractAlternatives(ruleBlockCtx grammar.IRule return alternatives } +// extractLexerAlternatives extracts alternatives from a lexer rule block +func (v *GrammarExtractorVisitor) extractLexerAlternatives(lexerRuleBlockCtx grammar.ILexerRuleBlockContext) []Alternative { + var alternatives []Alternative + + // Get lexer alternative list + lexerAltListCtx := lexerRuleBlockCtx.LexerAltList() + if lexerAltListCtx == nil { + return alternatives + } + + // Process each lexer alternative + for _, lexerAltCtx := range lexerAltListCtx.AllLexerAlt() { + alternative := v.extractLexerAlternative(lexerAltCtx) + alternatives = append(alternatives, alternative) + } + + return alternatives +} + +// extractLexerAlternative extracts a single lexer alternative +func (v *GrammarExtractorVisitor) extractLexerAlternative(lexerAltCtx grammar.ILexerAltContext) Alternative { + var elements []Element + + // Get lexer elements context + lexerElementsCtx := lexerAltCtx.LexerElements() + if lexerElementsCtx != nil { + // Process each lexer element + for _, lexerElementCtx := range lexerElementsCtx.AllLexerElement() { + element := v.extractLexerElement(lexerElementCtx) + if element != nil { + elements = append(elements, *element) + } + } + } + + return Alternative{ + Elements: elements, + } +} + // extractAlternative extracts a single alternative func (v *GrammarExtractorVisitor) extractAlternative(labeledAltCtx grammar.ILabeledAltContext) Alternative { var elements []Element @@ -387,6 +461,148 @@ func (v *GrammarExtractorVisitor) extractElement(elementCtx grammar.IElementCont return nil } +// extractLexerElement extracts a lexer element from a lexer element context +func (v *GrammarExtractorVisitor) extractLexerElement(lexerElementCtx grammar.ILexerElementContext) *Element { + // Handle lexer atoms (character ranges, terminals, etc.) 
+ if lexerAtomCtx := lexerElementCtx.LexerAtom(); lexerAtomCtx != nil { + element := v.extractLexerAtom(lexerAtomCtx) + // Check for quantifiers + if element != nil { + element.Quantifier = v.extractQuantifier(lexerElementCtx.EbnfSuffix()) + } + return element + } + + // Handle lexer blocks (grouped alternatives) + if lexerBlockCtx := lexerElementCtx.LexerBlock(); lexerBlockCtx != nil { + element := v.extractLexerBlock(lexerBlockCtx) + // Check for quantifiers + if element != nil { + element.Quantifier = v.extractQuantifier(lexerElementCtx.EbnfSuffix()) + } + return element + } + + // Handle action blocks (for now, just return nil as they don't generate text) + if lexerElementCtx.ActionBlock() != nil { + // Action blocks don't contribute to generated text, so we skip them + return nil + } + + return nil +} + +// extractLexerAtom extracts a lexer atom (character ranges, terminals, etc.) +func (v *GrammarExtractorVisitor) extractLexerAtom(lexerAtomCtx grammar.ILexerAtomContext) *Element { + // Handle terminal definition (string literal or token reference) + if terminalDefCtx := lexerAtomCtx.TerminalDef(); terminalDefCtx != nil { + return v.extractTerminalDef(terminalDefCtx) + } + + // Handle character range (e.g., [a-z]) + if characterRangeCtx := lexerAtomCtx.CharacterRange(); characterRangeCtx != nil { + return v.extractCharacterRange(characterRangeCtx) + } + + // Handle not set (e.g., ~[abc]) + if notSetCtx := lexerAtomCtx.NotSet(); notSetCtx != nil { + return v.extractNotSet(notSetCtx) + } + + // Handle lexer character set (e.g., [abc]) + if lexerCharSetToken := lexerAtomCtx.LEXER_CHAR_SET(); lexerCharSetToken != nil { + return &Element{ + Value: LiteralValue{Text: lexerCharSetToken.GetText()}, + } + } + + // Handle wildcard (.) 
+ if wildcardCtx := lexerAtomCtx.Wildcard(); wildcardCtx != nil { + return &Element{ + Value: WildcardValue{}, + } + } + + return nil +} + +// extractLexerBlock extracts a lexer block (grouped alternatives) +func (v *GrammarExtractorVisitor) extractLexerBlock(lexerBlockCtx grammar.ILexerBlockContext) *Element { + // Get the lexer alternative list from the block + lexerAltListCtx := lexerBlockCtx.LexerAltList() + if lexerAltListCtx == nil { + globalBlockID++ + blockID := fmt.Sprintf("lexer_block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all lexer alternatives from the block + lexerAlts := lexerAltListCtx.AllLexerAlt() + if len(lexerAlts) == 0 { + globalBlockID++ + blockID := fmt.Sprintf("lexer_block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all alternatives + blockAlternatives := []Alternative{} + for _, lexerAltCtx := range lexerAlts { + elements := []Element{} + if lexerElementsCtx := lexerAltCtx.LexerElements(); lexerElementsCtx != nil { + for _, lexerElementCtx := range lexerElementsCtx.AllLexerElement() { + element := v.extractLexerElement(lexerElementCtx) + if element != nil { + elements = append(elements, *element) + } + } + } + blockAlternatives = append(blockAlternatives, Alternative{Elements: elements}) + } + + // Generate global unique block ID and store mapping + globalBlockID++ + blockID := fmt.Sprintf("lexer_block_%d_alts", globalBlockID) + v.blockAltMap[blockID] = blockAlternatives + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: blockAlternatives}, + } +} + +// extractCharacterRange extracts a character range (e.g., 'a'..'z') +func (v *GrammarExtractorVisitor) extractCharacterRange(characterRangeCtx 
grammar.ICharacterRangeContext) *Element { + // Get the start and end of the range + stringLiterals := characterRangeCtx.AllSTRING_LITERAL() + if len(stringLiterals) == 2 { + startChar := stringLiterals[0].GetText() + endChar := stringLiterals[1].GetText() + rangeText := fmt.Sprintf("%s..%s", startChar, endChar) + return &Element{ + Value: LiteralValue{Text: rangeText}, + } + } + return nil +} + +// extractNotSet extracts a not set (e.g., ~[abc]) +func (v *GrammarExtractorVisitor) extractNotSet(notSetCtx grammar.INotSetContext) *Element { + // For now, represent as a literal text + // In a real implementation, this would need more sophisticated handling + return &Element{ + Value: LiteralValue{Text: "~[...]"}, + } +} + // extractLabeledElement extracts a labeled element (e.g., label=atom) func (v *GrammarExtractorVisitor) extractLabeledElement(labeledElementCtx grammar.ILabeledElementContext) *Element { // For now, just extract the atom part and ignore the label diff --git a/tools/fuzzing/internal/grammar/parser_test.go b/tools/fuzzing/internal/grammar/parser_test.go index 12ffa54..e3ee4cb 100644 --- a/tools/fuzzing/internal/grammar/parser_test.go +++ b/tools/fuzzing/internal/grammar/parser_test.go @@ -216,4 +216,195 @@ func createTempGrammarFile(t *testing.T, content string) string { } return tmpFile +} + +// TestLexerRuleParsing tests the parsing of lexer rules +func TestLexerRuleParsing(t *testing.T) { + grammarContent := ` +lexer grammar TestLexer; + +// Simple string literal +SELECT: 'SELECT'; + +// Character range +LETTER: [a-zA-Z]; + +// Complex rule with alternatives and quantifiers +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; + +// Rule with character set +DIGIT: [0-9]; + +// Rule with wildcard and quantifier +COMMENT: '//' .*? 
'\n'; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + grammar, err := ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse lexer grammar: %v", err) + } + + // Basic grammar properties + if grammar == nil { + t.Fatal("Grammar is nil") + } + if len(grammar.ParserRules) != 0 { + t.Errorf("Expected 0 parser rules, got %d", len(grammar.ParserRules)) + } + if len(grammar.LexerRules) != 5 { + t.Errorf("Expected 5 lexer rules, got %d", len(grammar.LexerRules)) + } + + // Test cases for lexer rule validation + tests := []struct { + ruleName string + alternatives int + elements []elementTest + }{ + { + ruleName: "SELECT", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'SELECT'", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "LETTER", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "[a-zA-Z]", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "IDENTIFIER", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "[a-zA-Z_]", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "[a-zA-Z0-9_]", quantifier: ZERO_MORE, elementType: "literal"}, + }, + }, + { + ruleName: "DIGIT", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "[0-9]", quantifier: NONE, elementType: "literal"}, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.ruleName, func(t *testing.T) { + rule := grammar.GetRule(tc.ruleName) + if rule == nil { + t.Fatalf("rule %s not found", tc.ruleName) + } + if rule.Name != tc.ruleName || !rule.IsLexer { + t.Errorf("rule %s has incorrect metadata: IsLexer=%v", tc.ruleName, rule.IsLexer) + } + if len(rule.Alternatives) != tc.alternatives { + t.Errorf("%s: expected %d alternatives, got %d", tc.ruleName, tc.alternatives, len(rule.Alternatives)) + } + + for _, elem := range tc.elements { + altIndex := 
elem.altIndex + elementIndex := elem.elementIndex + + if altIndex >= len(rule.Alternatives) { + t.Errorf("%s: alternative %d out of range", tc.ruleName, altIndex) + continue + } + + elements := rule.Alternatives[altIndex].Elements + if elementIndex >= len(elements) { + t.Errorf("%s alt %d: element %d out of range", tc.ruleName, altIndex, elementIndex) + continue + } + + element := elements[elementIndex] + if elem.value != "" && element.Value.String() != elem.value { + t.Errorf("%s alt %d elem %d: expected value %s, got %s", tc.ruleName, altIndex, elementIndex, elem.value, element.Value.String()) + } + if element.Quantifier != elem.quantifier { + t.Errorf("%s alt %d elem %d: expected quantifier %v, got %v", tc.ruleName, altIndex, elementIndex, elem.quantifier, element.Quantifier) + } + + // Validate element type using type assertions + switch elem.elementType { + case "literal": + if _, ok := element.Value.(LiteralValue); !ok { + t.Errorf("%s alt %d elem %d: expected LiteralValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "reference": + if _, ok := element.Value.(ReferenceValue); !ok { + t.Errorf("%s alt %d elem %d: expected ReferenceValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "block": + if _, ok := element.Value.(BlockValue); !ok { + t.Errorf("%s alt %d elem %d: expected BlockValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + } + } + }) + } +} + +// TestCombinedGrammarParsing tests parsing of combined grammar with both parser and lexer rules +func TestCombinedGrammarParsing(t *testing.T) { + grammarContent := ` +grammar CombinedTest; + +// Parser rules +statement: selectStmt; +selectStmt: 'SELECT' IDENTIFIER; + +// Lexer rules +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; +WS: [ \t\r\n]+ -> skip; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + grammar, err := ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse combined 
grammar: %v", err) + } + + // Basic grammar properties + if grammar == nil { + t.Fatal("Grammar is nil") + } + if len(grammar.ParserRules) != 2 { + t.Errorf("Expected 2 parser rules, got %d", len(grammar.ParserRules)) + } + if len(grammar.LexerRules) != 2 { + t.Errorf("Expected 2 lexer rules, got %d", len(grammar.LexerRules)) + } + + // Test parser rule + statement := grammar.GetRule("statement") + if statement == nil { + t.Fatal("Parser rule 'statement' not found") + } + if statement.IsLexer { + t.Error("Parser rule incorrectly marked as lexer rule") + } + + // Test lexer rule + identifier := grammar.GetRule("IDENTIFIER") + if identifier == nil { + t.Fatal("Lexer rule 'IDENTIFIER' not found") + } + if !identifier.IsLexer { + t.Error("Lexer rule incorrectly marked as parser rule") + } + + // Test that GetAllRules returns both types + allRules := grammar.GetAllRules() + if len(allRules) != 4 { + t.Errorf("Expected 4 total rules, got %d", len(allRules)) + } } \ No newline at end of file From 192f56628f14535c334fff8b049480249bdcdf6b Mon Sep 17 00:00:00 2001 From: h3n4l Date: Thu, 28 Aug 2025 11:18:38 +0800 Subject: [PATCH 5/9] feat: generator for lexer rules --- .../fuzzing/internal/lexer/token_generator.go | 351 ++++++++++++++++++ .../internal/lexer/token_generator_test.go | 344 +++++++++++++++++ 2 files changed, 695 insertions(+) create mode 100644 tools/fuzzing/internal/lexer/token_generator.go create mode 100644 tools/fuzzing/internal/lexer/token_generator_test.go diff --git a/tools/fuzzing/internal/lexer/token_generator.go b/tools/fuzzing/internal/lexer/token_generator.go new file mode 100644 index 0000000..c201152 --- /dev/null +++ b/tools/fuzzing/internal/lexer/token_generator.go @@ -0,0 +1,351 @@ +package lexer + +import ( + "fmt" + "math/rand" + "regexp" + "strings" + + "github.com/bytebase/parser/tools/fuzzing/internal/grammar" +) + +// TokenGenerator generates tokens from lexer rules +type TokenGenerator struct { + random *rand.Rand + config 
*TokenGeneratorConfig +} + +// TokenGeneratorConfig controls token generation behavior +type TokenGeneratorConfig struct { + // MaxQuantifierCount limits how many times quantified elements repeat + MaxQuantifierCount int + // MinQuantifierCount sets minimum repetitions for + quantifiers + MinQuantifierCount int + // OptionalProbability controls likelihood of including optional elements (0.0-1.0) + OptionalProbability float64 + // MaxDepth limits recursion depth to prevent infinite loops + MaxDepth int +} + +// NewTokenGenerator creates a new token generator +func NewTokenGenerator(seed int64, config *TokenGeneratorConfig) *TokenGenerator { + if config == nil { + config = &TokenGeneratorConfig{ + MaxQuantifierCount: 5, + MinQuantifierCount: 1, + OptionalProbability: 0.7, + MaxDepth: 10, + } + } + return &TokenGenerator{ + random: rand.New(rand.NewSource(seed)), + config: config, + } +} + +// GenerateToken generates a token string from a lexer rule +func (g *TokenGenerator) GenerateToken(rule *grammar.Rule) (string, error) { + if !rule.IsLexer { + return "", fmt.Errorf("rule %s is not a lexer rule", rule.Name) + } + + if len(rule.Alternatives) == 0 { + return "", fmt.Errorf("rule %s has no alternatives", rule.Name) + } + + // Select a random alternative + altIndex := g.random.Intn(len(rule.Alternatives)) + alternative := rule.Alternatives[altIndex] + + // Generate from the selected alternative + return g.generateFromAlternative(&alternative, 0) +} + +// generateFromAlternative generates text from a lexer rule alternative +func (g *TokenGenerator) generateFromAlternative(alt *grammar.Alternative, depth int) (string, error) { + if depth > g.config.MaxDepth { + return "", fmt.Errorf("maximum depth exceeded") + } + + var result strings.Builder + for _, element := range alt.Elements { + text, err := g.generateFromElement(&element, depth+1) + if err != nil { + return "", err + } + result.WriteString(text) + } + return result.String(), nil +} + +// generateFromElement 
generates text from a single lexer element +func (g *TokenGenerator) generateFromElement(element *grammar.Element, depth int) (string, error) { + if depth > g.config.MaxDepth { + return "", fmt.Errorf("maximum depth exceeded") + } + + // Handle quantifiers + switch element.Quantifier { + case grammar.OPTIONAL_Q: // ? + if g.random.Float64() > g.config.OptionalProbability { + return "", nil // Skip optional element + } + return g.generateElementContent(element, depth) + + case grammar.ZERO_MORE: // * + count := g.random.Intn(g.config.MaxQuantifierCount + 1) // 0 to MaxQuantifierCount + return g.generateRepeated(element, count, depth) + + case grammar.ONE_MORE: // + + count := g.config.MinQuantifierCount + g.random.Intn(g.config.MaxQuantifierCount) + return g.generateRepeated(element, count, depth) + + default: // NONE + return g.generateElementContent(element, depth) + } +} + +// generateRepeated generates repeated content for quantified elements +func (g *TokenGenerator) generateRepeated(element *grammar.Element, count int, depth int) (string, error) { + var result strings.Builder + for i := 0; i < count; i++ { + text, err := g.generateElementContent(element, depth) + if err != nil { + return "", err + } + result.WriteString(text) + } + return result.String(), nil +} + +// generateElementContent generates the actual content for an element +func (g *TokenGenerator) generateElementContent(element *grammar.Element, depth int) (string, error) { + switch value := element.Value.(type) { + case grammar.LiteralValue: + return g.generateFromLiteral(value) + case grammar.BlockValue: + return g.generateFromBlock(value, depth) + case grammar.WildcardValue: + return g.generateFromWildcard() + case grammar.ReferenceValue: + // For lexer rules, this typically shouldn't happen unless it's a fragment reference + // For now, return the reference name as placeholder + return fmt.Sprintf("<%s>", value.Name), nil + default: + return "", fmt.Errorf("unsupported element value type: %T", 
value) + } +} + +// generateFromLiteral generates text from a literal value +func (g *TokenGenerator) generateFromLiteral(literal grammar.LiteralValue) (string, error) { + text := literal.Text + + // Handle string literals - remove quotes + if len(text) >= 2 && text[0] == '\'' && text[len(text)-1] == '\'' { + return text[1 : len(text)-1], nil + } + + // Handle negated sets like ~[...] FIRST (before checking for ..) + if strings.HasPrefix(text, "~[") && strings.HasSuffix(text, "]") { + return g.generateFromNegatedSet(text) + } + + // Handle character sets like [a-zA-Z] + if len(text) >= 2 && text[0] == '[' && text[len(text)-1] == ']' { + return g.generateFromCharacterSet(text[1 : len(text)-1]) + } + + // Handle character ranges like 'a'..'z' + if strings.Contains(text, "..") { + return g.generateFromCharacterRange(text) + } + + // Default: return the literal as-is + return text, nil +} + +// generateFromCharacterSet generates a character from a character set like [a-zA-Z0-9_] +func (g *TokenGenerator) generateFromCharacterSet(charset string) (string, error) { + chars, err := g.expandCharacterSet(charset) + if err != nil { + return "", err + } + if len(chars) == 0 { + return "", fmt.Errorf("empty character set") + } + + // Select a random character from the set + index := g.random.Intn(len(chars)) + return string(chars[index]), nil +} + +// expandCharacterSet expands a character set specification into actual characters +func (g *TokenGenerator) expandCharacterSet(charset string) ([]rune, error) { + var chars []rune + i := 0 + + for i < len(charset) { + // Handle escape sequences + if i < len(charset) && charset[i] == '\\' && i+1 < len(charset) { + switch charset[i+1] { + case 'r': + chars = append(chars, '\r') + case 'n': + chars = append(chars, '\n') + case 't': + chars = append(chars, '\t') + case '\\': + chars = append(chars, '\\') + case '"': + chars = append(chars, '"') + case '\'': + chars = append(chars, '\'') + default: + // For unknown escapes, use the 
escaped character literally + chars = append(chars, rune(charset[i+1])) + } + i += 2 + } else if i+2 < len(charset) && charset[i+1] == '-' && charset[i+2] != '\\' { + // Handle range like a-z (but not when second char is an escape) + start := rune(charset[i]) + end := rune(charset[i+2]) + + if start > end { + return nil, fmt.Errorf("invalid character range: %c-%c", start, end) + } + + for c := start; c <= end; c++ { + chars = append(chars, c) + } + i += 3 + } else { + // Handle single character + chars = append(chars, rune(charset[i])) + i++ + } + } + + return chars, nil +} + +// generateFromCharacterRange generates from a character range like 'a'..'z' +func (g *TokenGenerator) generateFromCharacterRange(rangeText string) (string, error) { + // Extract start and end characters from 'a'..'z' format + parts := strings.Split(rangeText, "..") + if len(parts) != 2 { + return "", fmt.Errorf("invalid character range format: %s", rangeText) + } + + start := strings.Trim(parts[0], "'\"") + end := strings.Trim(parts[1], "'\"") + + if len(start) != 1 || len(end) != 1 { + return "", fmt.Errorf("character range must be single characters: %s", rangeText) + } + + startChar := rune(start[0]) + endChar := rune(end[0]) + + if startChar > endChar { + return "", fmt.Errorf("invalid character range: %c > %c", startChar, endChar) + } + + // Generate random character in range + rangeSize := int(endChar - startChar + 1) + offset := g.random.Intn(rangeSize) + result := startChar + rune(offset) + + return string(result), nil +} + +// generateFromBlock generates text from a block value +func (g *TokenGenerator) generateFromBlock(block grammar.BlockValue, depth int) (string, error) { + if len(block.Alternatives) == 0 { + return "", nil + } + + // Select a random alternative from the block + altIndex := g.random.Intn(len(block.Alternatives)) + alternative := &block.Alternatives[altIndex] + + return g.generateFromAlternative(alternative, depth) +} + +// generateFromWildcard generates a 
character for wildcard (.) +func (g *TokenGenerator) generateFromWildcard() (string, error) { + // Generate a random printable ASCII character + // Range: 32-126 (space to tilde) + char := rune(32 + g.random.Intn(95)) + return string(char), nil +} + +// generateFromNegatedSet generates a character NOT in the specified set +func (g *TokenGenerator) generateFromNegatedSet(negatedSet string) (string, error) { + // Extract the character set from ~[...] format + if len(negatedSet) < 4 || !strings.HasPrefix(negatedSet, "~[") || !strings.HasSuffix(negatedSet, "]") { + return "", fmt.Errorf("invalid negated set format: %s", negatedSet) + } + + charset := negatedSet[2 : len(negatedSet)-1] // Remove ~[ and ] + + // Expand the excluded character set + excludedChars, err := g.expandCharacterSet(charset) + if err != nil { + return "", fmt.Errorf("failed to expand excluded character set: %v", err) + } + + // Create a map for quick lookup + excluded := make(map[rune]bool) + for _, c := range excludedChars { + excluded[c] = true + } + + // Generate a character that's not in the excluded set + // Try common printable ASCII characters first + candidates := []rune{} + + // Add letters + for c := 'a'; c <= 'z'; c++ { + if !excluded[c] { + candidates = append(candidates, c) + } + } + for c := 'A'; c <= 'Z'; c++ { + if !excluded[c] { + candidates = append(candidates, c) + } + } + + // Add digits + for c := '0'; c <= '9'; c++ { + if !excluded[c] { + candidates = append(candidates, c) + } + } + + // Add some special characters + specialChars := []rune{' ', '!', '#', '$', '%', '&', '*', '+', '/', '=', '?', '@', '^', '_', '`', '|', '~'} + for _, c := range specialChars { + if !excluded[c] { + candidates = append(candidates, c) + } + } + + if len(candidates) == 0 { + return "", fmt.Errorf("no valid characters available (all excluded)") + } + + // Select a random candidate + index := g.random.Intn(len(candidates)) + return string(candidates[index]), nil +} + +// ValidateCharacterSet validates 
if a character set specification is valid +func ValidateCharacterSet(charset string) error { + // Use regex to validate basic character set patterns + validPattern := regexp.MustCompile(`^[a-zA-Z0-9_\-\[\]\\^]+$`) + if !validPattern.MatchString(charset) { + return fmt.Errorf("invalid characters in character set: %s", charset) + } + return nil +} \ No newline at end of file diff --git a/tools/fuzzing/internal/lexer/token_generator_test.go b/tools/fuzzing/internal/lexer/token_generator_test.go new file mode 100644 index 0000000..c944268 --- /dev/null +++ b/tools/fuzzing/internal/lexer/token_generator_test.go @@ -0,0 +1,344 @@ +package lexer + +import ( + "os" + "path/filepath" + "regexp" + "strings" + "testing" + + "github.com/bytebase/parser/tools/fuzzing/internal/grammar" +) + +// TestTokenGeneratorBasic tests basic token generation functionality +func TestTokenGeneratorBasic(t *testing.T) { + config := &TokenGeneratorConfig{ + MaxQuantifierCount: 3, + MinQuantifierCount: 1, + OptionalProbability: 1.0, // Always include optional elements for testing + MaxDepth: 5, + } + generator := NewTokenGenerator(12345, config) + + tests := []struct { + ruleName string + grammarText string + validator func(string) bool + description string + }{ + { + ruleName: "SELECT", + grammarText: "SELECT: 'SELECT';", + validator: func(s string) bool { return s == "SELECT" }, + description: "simple string literal", + }, + { + ruleName: "LETTER", + grammarText: "LETTER: [a-z];", + validator: func(s string) bool { return len(s) == 1 && s[0] >= 'a' && s[0] <= 'z' }, + description: "single character range", + }, + { + ruleName: "DIGIT", + grammarText: "DIGIT: [0-9];", + validator: func(s string) bool { return len(s) == 1 && s[0] >= '0' && s[0] <= '9' }, + description: "digit character range", + }, + { + ruleName: "IDENTIFIER", + grammarText: "IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*;", + validator: func(s string) bool { + if len(s) == 0 { + return false + } + // First character must be letter or 
underscore + first := s[0] + if !((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first == '_') { + return false + } + // Rest must be letters, digits, or underscore + for _, c := range s[1:] { + if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') { + return false + } + } + return true + }, + description: "identifier with quantifier", + }, + } + + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + // Create a temporary grammar file + grammarContent := "lexer grammar Test;\n\n" + tt.grammarText + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + // Parse the grammar + parsedGrammar, err := grammar.ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse grammar: %v", err) + } + + // Get the rule + rule := parsedGrammar.GetRule(tt.ruleName) + if rule == nil { + t.Fatalf("Rule %s not found", tt.ruleName) + } + + // Generate multiple tokens to test consistency + for i := 0; i < 10; i++ { + token, err := generator.GenerateToken(rule) + if err != nil { + t.Errorf("Failed to generate token: %v", err) + continue + } + + if !tt.validator(token) { + t.Errorf("Generated token '%s' does not match expected pattern for %s", token, tt.description) + } + } + }) + } +} + +// TestQuantifierHandling tests EBNF quantifier handling +func TestQuantifierHandling(t *testing.T) { + config := &TokenGeneratorConfig{ + MaxQuantifierCount: 5, + MinQuantifierCount: 2, + OptionalProbability: 0.5, + MaxDepth: 5, + } + generator := NewTokenGenerator(54321, config) + + tests := []struct { + ruleName string + grammarText string + validator func(string) bool + description string + }{ + { + ruleName: "OPTIONAL", + grammarText: "OPTIONAL: 'A' 'B'?;", + validator: func(s string) bool { + return s == "A" || s == "AB" + }, + description: "optional element with ?", + }, + { + ruleName: "ZERO_MORE", + grammarText: "ZERO_MORE: 'X' 'Y'*;", + validator: func(s string) bool { + if 
!strings.HasPrefix(s, "X") { + return false + } + rest := s[1:] + for _, c := range rest { + if c != 'Y' { + return false + } + } + return true + }, + description: "zero or more with *", + }, + { + ruleName: "ONE_MORE", + grammarText: "ONE_MORE: 'Z' 'W'+;", + validator: func(s string) bool { + if !strings.HasPrefix(s, "Z") { + return false + } + rest := s[1:] + if len(rest) == 0 { + return false // + requires at least one + } + for _, c := range rest { + if c != 'W' { + return false + } + } + return true + }, + description: "one or more with +", + }, + } + + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + // Create a temporary grammar file + grammarContent := "lexer grammar Test;\n\n" + tt.grammarText + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + // Parse the grammar + parsedGrammar, err := grammar.ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse grammar: %v", err) + } + + // Get the rule + rule := parsedGrammar.GetRule(tt.ruleName) + if rule == nil { + t.Fatalf("Rule %s not found", tt.ruleName) + } + + // Generate multiple tokens to test quantifier behavior + validCount := 0 + for i := 0; i < 20; i++ { + token, err := generator.GenerateToken(rule) + if err != nil { + t.Errorf("Failed to generate token: %v", err) + continue + } + + if tt.validator(token) { + validCount++ + } else { + t.Logf("Generated token '%s' for %s (validation failed but continuing)", token, tt.description) + } + } + + // At least 50% of generated tokens should be valid + if validCount < 10 { + t.Errorf("Too few valid tokens generated (%d/20) for %s", validCount, tt.description) + } + }) + } +} + +// TestCharacterSetExpansion tests character set expansion functionality +func TestCharacterSetExpansion(t *testing.T) { + generator := NewTokenGenerator(9999, nil) + + tests := []struct { + charset string + expected []rune + }{ + {"abc", []rune{'a', 'b', 'c'}}, + {"a-c", []rune{'a', 'b', 'c'}}, + {"0-2", 
[]rune{'0', '1', '2'}}, + {"a-cX", []rune{'a', 'b', 'c', 'X'}}, + {"A-Z_", append(makeRange('A', 'Z'), '_')}, + } + + for _, tt := range tests { + t.Run(tt.charset, func(t *testing.T) { + result, err := generator.expandCharacterSet(tt.charset) + if err != nil { + t.Fatalf("Failed to expand character set '%s': %v", tt.charset, err) + } + + if len(result) != len(tt.expected) { + t.Errorf("Expected %d characters, got %d", len(tt.expected), len(result)) + return + } + + for i, expected := range tt.expected { + if result[i] != expected { + t.Errorf("At position %d: expected '%c', got '%c'", i, expected, result[i]) + } + } + }) + } +} + +// TestComplexLexerRules tests complex lexer rules with multiple elements +func TestComplexLexerRules(t *testing.T) { + config := &TokenGeneratorConfig{ + MaxQuantifierCount: 3, + MinQuantifierCount: 1, + OptionalProbability: 0.8, + MaxDepth: 10, + } + generator := NewTokenGenerator(11111, config) + + grammarContent := ` +lexer grammar ComplexTest; + +// Complex identifier rule +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; + +// Number with optional decimal part +NUMBER: [0-9]+ ('.' 
[0-9]+)?; + +// String with escaped quotes +STRING: '"' (~'"')* '"'; + +// Comment line +COMMENT: '//' (~[\r\n])*; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + parsedGrammar, err := grammar.ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse complex grammar: %v", err) + } + + tests := []struct { + ruleName string + pattern string + }{ + {"IDENTIFIER", `^[a-zA-Z_][a-zA-Z0-9_]*$`}, + {"NUMBER", `^[0-9]+(\.[0-9]+)?$`}, + {"STRING", `^"[^"]*"$`}, + {"COMMENT", `^//.*$`}, + } + + for _, tt := range tests { + t.Run(tt.ruleName, func(t *testing.T) { + rule := parsedGrammar.GetRule(tt.ruleName) + if rule == nil { + t.Fatalf("Rule %s not found", tt.ruleName) + } + + regex := regexp.MustCompile(tt.pattern) + validCount := 0 + + for i := 0; i < 10; i++ { + token, err := generator.GenerateToken(rule) + if err != nil { + t.Errorf("Failed to generate token for %s: %v", tt.ruleName, err) + continue + } + + t.Logf("Generated token for %s: '%s'", tt.ruleName, token) + + if regex.MatchString(token) { + validCount++ + } + } + + // Expect at least some valid tokens + if validCount == 0 { + t.Errorf("No valid tokens generated for %s", tt.ruleName) + } + }) + } +} + +// Helper functions + +func createTempGrammarFile(t *testing.T, content string) string { + tmpDir := os.TempDir() + tmpFile := filepath.Join(tmpDir, "test_lexer.g4") + + err := os.WriteFile(tmpFile, []byte(content), 0644) + if err != nil { + t.Fatalf("Failed to create temp grammar file: %v", err) + } + + return tmpFile +} + +func makeRange(start, end rune) []rune { + var result []rune + for c := start; c <= end; c++ { + result = append(result, c) + } + return result +} \ No newline at end of file From 5692d2125d4efc711df12e00e64b8b6cd55f51aa Mon Sep 17 00:00:00 2001 From: h3n4l Date: Thu, 28 Aug 2025 11:31:23 +0800 Subject: [PATCH 6/9] chore: merge grammars --- tools/fuzzing/internal/generator/generator.go | 45 ++---- tools/fuzzing/internal/grammar/parser.go | 
62 ++++++++ tools/fuzzing/internal/grammar/parser_test.go | 147 ++++++++++++++++++ 3 files changed, 223 insertions(+), 31 deletions(-) diff --git a/tools/fuzzing/internal/generator/generator.go b/tools/fuzzing/internal/generator/generator.go index 207c677..a4b7840 100644 --- a/tools/fuzzing/internal/generator/generator.go +++ b/tools/fuzzing/internal/generator/generator.go @@ -11,9 +11,9 @@ import ( // Generator handles the fuzzing logic type Generator struct { - config *config.Config - random *rand.Rand - grammars []*grammar.ParsedGrammar + config *config.Config + random *rand.Rand + grammar *grammar.ParsedGrammar } // New creates a new generator with the given configuration @@ -28,20 +28,18 @@ func New(cfg *config.Config) *Generator { func (g *Generator) Generate() error { fmt.Println("Initializing grammar parser...") - // Parse all grammar files - g.grammars = make([]*grammar.ParsedGrammar, len(g.config.GrammarFiles)) - for i, filePath := range g.config.GrammarFiles { - parsedGrammar, err := grammar.ParseGrammarFile(filePath) - if err != nil { - return errors.Wrapf(err, "failed to parse grammar file %s", filePath) - } - g.grammars[i] = parsedGrammar - fmt.Printf("Parsed grammar file: %s\n", filePath) + // Parse and merge all grammar files into a single grammar + var err error + g.grammar, err = grammar.ParseAndMergeGrammarFiles(g.config.GrammarFiles) + if err != nil { + return errors.Wrap(err, "failed to parse and merge grammar files") } + + fmt.Printf("Parsed and merged %d grammar files into single grammar\n", len(g.config.GrammarFiles)) // Validate start rule exists - if !g.hasRule(g.config.StartRule) { - return errors.Errorf("start rule '%s' not found in any grammar file", g.config.StartRule) + if g.grammar.GetRule(g.config.StartRule) == nil { + return errors.Errorf("start rule '%s' not found in merged grammar", g.config.StartRule) } fmt.Printf("Generating %d queries from rule '%s'...\n", g.config.Count, g.config.StartRule) @@ -55,24 +53,9 @@ func (g 
*Generator) Generate() error { return nil } -// hasRule checks if a rule exists in any of the parsed grammars -func (g *Generator) hasRule(ruleName string) bool { - for _, grammar := range g.grammars { - if grammar.GetRule(ruleName) != nil { - return true - } - } - return false -} - -// getRule gets a rule from any of the parsed grammars +// getRule gets a rule from the merged grammar func (g *Generator) getRule(ruleName string) *grammar.Rule { - for _, grammar := range g.grammars { - if rule := grammar.GetRule(ruleName); rule != nil { - return rule - } - } - return nil + return g.grammar.GetRule(ruleName) } // generateQuery creates a single query using grammar rules diff --git a/tools/fuzzing/internal/grammar/parser.go b/tools/fuzzing/internal/grammar/parser.go index 70ceec2..cd43d1c 100644 --- a/tools/fuzzing/internal/grammar/parser.go +++ b/tools/fuzzing/internal/grammar/parser.go @@ -191,6 +191,68 @@ func (g *ParsedGrammar) IsGeneratedBlock(name string) bool { return exists } +// MergeGrammar merges another grammar into this one +func (g *ParsedGrammar) MergeGrammar(other *ParsedGrammar) error { + // Merge lexer rules + for name, rule := range other.LexerRules { + if _, exists := g.LexerRules[name]; exists { + return fmt.Errorf("duplicate lexer rule '%s' found in grammars '%s' and '%s'", name, g.FilePath, other.FilePath) + } + g.LexerRules[name] = rule + } + + // Merge parser rules + for name, rule := range other.ParserRules { + if _, exists := g.ParserRules[name]; exists { + return fmt.Errorf("duplicate parser rule '%s' found in grammars '%s' and '%s'", name, g.FilePath, other.FilePath) + } + g.ParserRules[name] = rule + } + + // Merge block alternatives map + for blockID, alternatives := range other.BlockAltMap { + if _, exists := g.BlockAltMap[blockID]; exists { + return fmt.Errorf("duplicate block ID '%s' found in grammars '%s' and '%s'", blockID, g.FilePath, other.FilePath) + } + g.BlockAltMap[blockID] = alternatives + } + + // Update file path to indicate 
it's a merged grammar + if g.FilePath != other.FilePath { + g.FilePath = fmt.Sprintf("%s + %s", g.FilePath, other.FilePath) + } + + return nil +} + +// ParseAndMergeGrammarFiles parses multiple grammar files and merges them into a single ParsedGrammar +func ParseAndMergeGrammarFiles(filePaths []string) (*ParsedGrammar, error) { + if len(filePaths) == 0 { + return nil, errors.New("no grammar files provided") + } + + // Parse the first grammar file + mergedGrammar, err := ParseGrammarFile(filePaths[0]) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse first grammar file %s", filePaths[0]) + } + + // Merge additional grammar files + for i := 1; i < len(filePaths); i++ { + filePath := filePaths[i] + grammar, err := ParseGrammarFile(filePath) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse grammar file %s", filePath) + } + + if err := mergedGrammar.MergeGrammar(grammar); err != nil { + return nil, errors.Wrapf(err, "failed to merge grammar file %s", filePath) + } + } + + return mergedGrammar, nil +} + // IsRule checks if an element refers to another rule or generated block func (e *Element) IsRule() bool { _, isRef := e.Value.(ReferenceValue) diff --git a/tools/fuzzing/internal/grammar/parser_test.go b/tools/fuzzing/internal/grammar/parser_test.go index e3ee4cb..15cb127 100644 --- a/tools/fuzzing/internal/grammar/parser_test.go +++ b/tools/fuzzing/internal/grammar/parser_test.go @@ -3,6 +3,7 @@ package grammar import ( "os" "path/filepath" + "strings" "testing" ) @@ -218,6 +219,18 @@ func createTempGrammarFile(t *testing.T, content string) string { return tmpFile } +func createTempGrammarFileWithName(t *testing.T, content string, filename string) string { + tmpDir := os.TempDir() + tmpFile := filepath.Join(tmpDir, filename) + + err := os.WriteFile(tmpFile, []byte(content), 0644) + if err != nil { + t.Fatalf("Failed to create temp grammar file: %v", err) + } + + return tmpFile +} + // TestLexerRuleParsing tests the parsing of lexer 
rules func TestLexerRuleParsing(t *testing.T) { grammarContent := ` @@ -407,4 +420,138 @@ WS: [ \t\r\n]+ -> skip; if len(allRules) != 4 { t.Errorf("Expected 4 total rules, got %d", len(allRules)) } +} + +// TestGrammarMerging tests merging multiple grammar files +func TestGrammarMerging(t *testing.T) { + // Create first grammar file (parser rules) + parserGrammarContent := ` +parser grammar ParserTest; + +options { + tokenVocab = LexerTest; +} + +statement: selectStmt; +selectStmt: 'SELECT' IDENTIFIER; +` + + // Create second grammar file (lexer rules) + lexerGrammarContent := ` +lexer grammar LexerTest; + +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; +WS: [ \t\r\n]+ -> skip; +` + + // Create temporary files with unique names + tmpParserFile := createTempGrammarFileWithName(t, parserGrammarContent, "test_parser.g4") + defer os.Remove(tmpParserFile) + + tmpLexerFile := createTempGrammarFileWithName(t, lexerGrammarContent, "test_lexer.g4") + defer os.Remove(tmpLexerFile) + + // Test parsing and merging + filePaths := []string{tmpParserFile, tmpLexerFile} + mergedGrammar, err := ParseAndMergeGrammarFiles(filePaths) + if err != nil { + t.Fatalf("Failed to parse and merge grammar files: %v", err) + } + + // Verify merged grammar properties + if mergedGrammar == nil { + t.Fatal("Merged grammar is nil") + } + + if len(mergedGrammar.ParserRules) != 2 { + t.Errorf("Expected 2 parser rules, got %d", len(mergedGrammar.ParserRules)) + } + + if len(mergedGrammar.LexerRules) != 2 { + t.Errorf("Expected 2 lexer rules, got %d", len(mergedGrammar.LexerRules)) + } + + // Test that both parser and lexer rules are accessible + statement := mergedGrammar.GetRule("statement") + if statement == nil || statement.IsLexer { + t.Error("Parser rule 'statement' not found or incorrectly marked") + } + + identifier := mergedGrammar.GetRule("IDENTIFIER") + if identifier == nil || !identifier.IsLexer { + t.Error("Lexer rule 'IDENTIFIER' not found or incorrectly marked") + } + + // Test that merged path is 
updated + if !strings.Contains(mergedGrammar.FilePath, "+") { + t.Errorf("Expected merged file path to contain '+', got: %s", mergedGrammar.FilePath) + } + + // Test GetAllRules on merged grammar + allRules := mergedGrammar.GetAllRules() + if len(allRules) != 4 { + t.Errorf("Expected 4 total rules in merged grammar, got %d", len(allRules)) + } +} + +// TestGrammarMergingWithConflicts tests handling of duplicate rule names +func TestGrammarMergingWithConflicts(t *testing.T) { + // Create two grammars with conflicting rule names + grammar1Content := ` +lexer grammar Test1; +IDENTIFIER: [a-z]+; +` + + grammar2Content := ` +lexer grammar Test2; +IDENTIFIER: [A-Z]+; // Conflict with first grammar +` + + tmpFile1 := createTempGrammarFileWithName(t, grammar1Content, "conflict1.g4") + defer os.Remove(tmpFile1) + + tmpFile2 := createTempGrammarFileWithName(t, grammar2Content, "conflict2.g4") + defer os.Remove(tmpFile2) + + // Test that merging fails with duplicate rule names + filePaths := []string{tmpFile1, tmpFile2} + _, err := ParseAndMergeGrammarFiles(filePaths) + if err == nil { + t.Error("Expected error when merging grammars with duplicate rule names") + } + + if !strings.Contains(err.Error(), "duplicate") { + t.Errorf("Expected error about duplicate rules, got: %v", err) + } +} + +// TestParseAndMergeGrammarFilesEdgeCases tests edge cases +func TestParseAndMergeGrammarFilesEdgeCases(t *testing.T) { + // Test with empty file list + _, err := ParseAndMergeGrammarFiles([]string{}) + if err == nil { + t.Error("Expected error with empty file list") + } + + // Test with single file + grammarContent := ` +lexer grammar SingleTest; +TOKEN: 'test'; +` + + tmpFile := createTempGrammarFileWithName(t, grammarContent, "single.g4") + defer os.Remove(tmpFile) + + grammar, err := ParseAndMergeGrammarFiles([]string{tmpFile}) + if err != nil { + t.Fatalf("Failed to parse single grammar file: %v", err) + } + + if len(grammar.LexerRules) != 1 { + t.Errorf("Expected 1 lexer rule, got 
%d", len(grammar.LexerRules)) + } + + if grammar.GetRule("TOKEN") == nil { + t.Error("TOKEN rule not found in single file grammar") + } } \ No newline at end of file From 5659f1c09b90b68099015f57e835543c7beea46a Mon Sep 17 00:00:00 2001 From: h3n4l Date: Thu, 28 Aug 2025 11:32:43 +0800 Subject: [PATCH 7/9] chore: remove list grammar options --- tools/fuzzing/cmd/fuzzer/main.go | 49 +++++++------------------------- 1 file changed, 11 insertions(+), 38 deletions(-) diff --git a/tools/fuzzing/cmd/fuzzer/main.go b/tools/fuzzing/cmd/fuzzer/main.go index dd62d7e..c388520 100644 --- a/tools/fuzzing/cmd/fuzzer/main.go +++ b/tools/fuzzing/cmd/fuzzer/main.go @@ -9,33 +9,31 @@ import ( "github.com/bytebase/parser/tools/fuzzing/internal/config" "github.com/bytebase/parser/tools/fuzzing/internal/generator" - "github.com/bytebase/parser/tools/fuzzing/internal/grammar" ) func main() { cfg := parseFlags() - + if err := cfg.Validate(); err != nil { fmt.Fprintf(os.Stderr, "Error: %v\n", err) os.Exit(1) } - + cfg.Print() - + gen := generator.New(cfg) if err := gen.Generate(); err != nil { fmt.Fprintf(os.Stderr, "Generation failed: %v\n", err) os.Exit(1) } - + fmt.Println("Generation completed successfully!") } func parseFlags() *config.Config { cfg := &config.Config{} - var listGrammars bool var grammarArg string - + flag.StringVar(&grammarArg, "grammar", "", "Grammar file(s): single file or comma-separated lexer,parser files") flag.StringVar(&cfg.StartRule, "start-rule", "", "Starting grammar rule name") flag.IntVar(&cfg.Count, "count", 10, "Number of queries to generate") @@ -46,18 +44,15 @@ func parseFlags() *config.Config { flag.IntVar(&cfg.QuantifierCount, "quantifier-count", 0, "Fixed count for all quantifiers (overrides min/max)") flag.StringVar(&cfg.Output, "output", "", "Output file path (default: stdout)") flag.Int64Var(&cfg.Seed, "seed", time.Now().UnixNano(), "Random seed for reproducible generation") - flag.BoolVar(&listGrammars, "list-grammars", false, "List all 
available grammars and exit") - + // Custom usage message flag.Usage = func() { fmt.Fprintf(os.Stderr, "Grammar-Aware Fuzzing Tool\n\n") fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0]) fmt.Fprintf(os.Stderr, "Options:\n") flag.PrintDefaults() - + fmt.Fprintf(os.Stderr, "\nExamples:\n") - fmt.Fprintf(os.Stderr, " # List available grammars\n") - fmt.Fprintf(os.Stderr, " %s --list-grammars\n\n", os.Args[0]) fmt.Fprintf(os.Stderr, " # Single combined grammar file\n") fmt.Fprintf(os.Stderr, " %s --grammar combined.g4 --start-rule selectStmt --count 10\n\n", os.Args[0]) fmt.Fprintf(os.Stderr, " # Separate lexer and parser files\n") @@ -69,31 +64,9 @@ func parseFlags() *config.Config { fmt.Fprintf(os.Stderr, " # Output to file\n") fmt.Fprintf(os.Stderr, " %s --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 100 --output queries.sql\n\n", os.Args[0]) } - + flag.Parse() - - // Handle --list-grammars - if listGrammars { - grammars, err := grammar.ListAvailableGrammars() - if err != nil { - fmt.Fprintf(os.Stderr, "Error listing grammars: %v\n", err) - os.Exit(1) - } - - fmt.Println("Available grammars:") - for _, g := range grammars { - files, err := grammar.DiscoverGrammarFiles(g) - if err != nil { - fmt.Printf(" %s (error: %v)\n", g, err) - continue - } - fmt.Printf(" %s\n", g) - fmt.Printf(" Lexer: %s\n", files.LexerFile) - fmt.Printf(" Parser: %s\n", files.ParserFile) - } - os.Exit(0) - } - + // Parse grammar files from comma-separated argument if grammarArg != "" { files := strings.Split(grammarArg, ",") @@ -103,6 +76,6 @@ func parseFlags() *config.Config { } cfg.GrammarFiles = files } - + return cfg -} \ No newline at end of file +} From cf77e43c7d037d186e095897ecf736f75526d9d8 Mon Sep 17 00:00:00 2001 From: h3n4l Date: Fri, 29 Aug 2025 14:32:50 +0800 Subject: [PATCH 8/9] v1 --- tools/fuzzing/DESIGN.md | 394 ------------------ tools/fuzzing/Makefile | 27 +- tools/fuzzing/README.md | 124 ------ 
tools/fuzzing/cmd/fuzzer/main.go | 81 ---- tools/fuzzing/internal/config/config.go | 23 + tools/fuzzing/internal/generator/generator.go | 260 +++++++++++- tools/fuzzing/tests/postgresql_test.go | 192 +++++++++ tools/grammar/README.md | 36 -- 8 files changed, 461 insertions(+), 676 deletions(-) delete mode 100644 tools/fuzzing/DESIGN.md delete mode 100644 tools/fuzzing/README.md delete mode 100644 tools/fuzzing/cmd/fuzzer/main.go create mode 100644 tools/fuzzing/tests/postgresql_test.go delete mode 100644 tools/grammar/README.md diff --git a/tools/fuzzing/DESIGN.md b/tools/fuzzing/DESIGN.md deleted file mode 100644 index 5d05ad3..0000000 --- a/tools/fuzzing/DESIGN.md +++ /dev/null @@ -1,394 +0,0 @@ -# Grammar-Aware Fuzzing Tool Design - -## Overview - -A simple fuzzing tool that generates SQL inputs from ANTLR grammar rules to test parser performance on specific constructs. - -## Core Problems & Solutions - -### 1. Target Specific Rules -**Problem**: Performance issues often occur in specific rules (e.g., `createProcedureStatement`) -**Solution**: Allow users to specify starting rule chains - -```bash -./fuzzer --grammar postgresql --start-rule createProcedureStatement --count 100 -./fuzzer --grammar cql --start-rule selectStatement.whereClause --count 50 -``` - -### 2. Recursion Control -**Problem**: Grammar rules can be recursive, causing infinite loops during generation -**Solution**: Limit recursion depth per rule (proven to handle all ANTLR recursion types) - -#### ANTLR 4 Recursion Types - -**Direct Left Recursion:** -```antlr -expr: expr '+' expr | INT // expr directly refers to itself on left -``` - -**Direct Right Recursion:** -```antlr -expr: INT '+' expr | INT // expr directly refers to itself on right -``` - -**Indirect Recursion (Non-Left):** -```antlr -selectStmt: SELECT columns fromClause whereClause? 
-whereClause: WHERE expr -expr: '(' selectStmt ')' | INT // Indirect: expr -> selectStmt -> whereClause -> expr -``` -*Note: ANTLR 4 does NOT support mutually left recursive grammars. This example is valid because the recursion is not left-recursive (selectStmt doesn't start with selectStmt).* - -**Self-Recursion with Alternatives:** -```antlr -stmt: ifStmt | whileStmt | blockStmt -blockStmt: '{' stmt* '}' // blockStmt contains multiple stmt references -``` - -#### Why Depth Control Works - -**Theorem**: Any grammar rule expansion terminates in finite steps with depth limiting. - -**Proof by Contradiction:** -1. Assume infinite expansion despite depth limit `D` -2. Each recursive call increases depth: `depth(rule_n) = depth(rule_{n-1}) + 1` -3. When `depth ≥ D`, generator forces terminal selection -4. Therefore, maximum expansion depth is bounded by `D` -5. Since each rule has finite alternatives and finite elements, total expansion is finite ∎ - -#### Depth Control Implementation - -```go -func (g *Generator) GenerateFromRule(ruleName string, currentDepth int) string { - // Base case: exceed depth limit -> force terminal - if currentDepth >= g.maxDepth { - return g.forceTerminal(ruleName) - } - - rule := g.grammar.GetRule(ruleName) - - // Prefer non-recursive alternatives as depth increases - alternative := g.selectAlternativeWithDepthBias(rule, currentDepth) - - result := "" - for _, element := range alternative { - if element.IsRule() { - // Recursive call with incremented depth - result += g.GenerateFromRule(element.Name, currentDepth+1) - } else { - result += element.Literal - } - } - return result -} - -func (g *Generator) forceTerminal(ruleName string) string { - rule := g.grammar.GetRule(ruleName) - - // Find non-recursive alternatives (containing only terminals) - for _, alt := range rule.Alternatives { - if !alt.ContainsRecursion() { - return g.expandAlternative(alt, g.maxDepth) - } - } - - // Fallback: use default terminal for this rule type - return 
g.getDefaultTerminal(ruleName) -} -``` - -#### Examples with Depth Control - -```bash -./fuzzer --start-rule expr --max-depth 3 --count 5 -``` - -**Generated sequences:** -- Depth 0: `INT` (terminal) -- Depth 1: `INT + INT` -- Depth 2: `(INT + INT) + INT` -- Depth 3: `((INT + INT) + INT) + INT` (max depth reached) - -**Complex mutual recursion:** -```bash -./fuzzer --start-rule selectStmt --max-depth 4 --count 3 -``` - -**Expansion trace:** -``` -selectStmt (depth=0) -├── SELECT columns FROM table whereClause (depth=0) - └── whereClause (depth=1) - └── WHERE expr (depth=1) - └── '(' selectStmt ')' (depth=2) - └── selectStmt (depth=2) - └── SELECT columns FROM table (depth=2, no whereClause to avoid depth=4) -``` - -#### Depth Strategy Options - -**Conservative (Early Termination):** -- Lower max depth (3-5) -- Bias toward terminals as depth increases -- Prevents deep nesting, faster generation - -**Aggressive (Deep Testing):** -- Higher max depth (10-15) -- Equal probability until max depth -- Tests parser limits, slower generation - -```bash -# Conservative - quick, shallow testing -./fuzzer --start-rule expr --max-depth 3 --depth-strategy conservative - -# Aggressive - deep parser stress testing -./fuzzer --start-rule createProcedureStmt --max-depth 12 --depth-strategy aggressive -``` - -### 3. Optional Rule Probability -**Problem**: Optional rules (`selectStmt: SELECT columns FROM table whereClause?`) need probability control -**Solution**: Configure probability for optional elements (standard in grammar-based fuzzing) - -### 4. 
Quantified Rule Generation -**Problem**: Quantified rules (`stmt*`, `expr+`, `column{1,5}`) need count control -**Solution**: Configure generation counts for quantified elements - -#### ANTLR 4 Quantifier Types - -**Zero or More (`rule*`):** -```antlr -blockStmt: '{' stmt* '}' // Generate 0 to N statements -selectList: column (',' column)* // Generate 1 to N columns -``` - -**One or More (`rule+`):** -```antlr -identifier: LETTER (LETTER | DIGIT)+ // Generate 1 to N characters -``` - -**Note**: ANTLR v4 does not support `{n}` or `{n,m}` quantifier syntax. These are regex-style quantifiers not supported in ANTLR grammar files. - -#### Quantifier Control Strategy - -**Count Distribution Options:** -- **Uniform**: Equal probability for each count in range -- **Exponential**: Higher probability for lower counts (realistic) -- **Fixed**: Always generate specific count - -```bash -# Basic usage - user specifies max count -./fuzzer --start-rule blockStmt --max-quantifier 10 --count 100 - -# User controls both min and max for quantifiers -./fuzzer --start-rule selectList --min-quantifier 1 --max-quantifier 5 --count 50 - -# Fixed count for performance testing -./fuzzer --start-rule selectStmt --quantifier-count 100 --count 10 -``` - -#### Implementation Logic - -```go -type QuantifierConfig struct { - Strategy string // "uniform", "exponential", "fixed" - MinRepeat int // Minimum repetitions (overrides grammar min) - MaxRepeat int // Maximum repetitions (overrides grammar max) - FixedCount int // Fixed count for "fixed" strategy -} - -func (g *Generator) generateQuantified(element *GrammarElement, config QuantifierConfig) string { - var count int - - switch element.Quantifier { - case "*": // Zero or more - min := max(0, config.MinRepeat) - max := min(config.MaxRepeat, 50) // Reasonable default limit - count = g.selectCount(min, max, config.Strategy) - - case "+": // One or more - min := max(1, config.MinRepeat) - max := min(config.MaxRepeat, 50) - count = 
g.selectCount(min, max, config.Strategy) - - // Note: ANTLR v4 does not support {n} or {min,max} syntax - } - - result := "" - for i := 0; i < count; i++ { - if element.IsRule() { - result += g.GenerateFromRule(element.RuleName, g.currentDepth+1) - } else { - result += element.Literal - } - - // Add separators for lists (e.g., comma-separated) - if i < count-1 && element.HasSeparator() { - result += element.Separator - } - } - return result -} - -func (g *Generator) selectCount(min, max int, strategy string) int { - if min > max { - return min - } - - switch strategy { - case "fixed": - return min // Use minimum as fixed value - - case "uniform": - return min + g.random.Intn(max-min+1) - - case "exponential": - // Exponential decay: higher probability for lower counts - range_size := max - min + 1 - // Generate exponentially distributed number, then map to range - lambda := 2.0 / float64(range_size) - exp_val := g.random.ExpFloat64() / lambda - count := min + int(exp_val) - if count > max { - count = max - } - return count - - default: - return min + g.random.Intn(max-min+1) - } -} -``` - -#### Examples with Quantifier Control - -**Block statement with multiple statements:** -```bash -./fuzzer --start-rule blockStmt --quantifier-strategy exponential --max-repeat 8 -``` -**Generated:** -- 70% chance: `{ stmt; }` (1 statement) -- 20% chance: `{ stmt; stmt; }` (2 statements) -- 7% chance: `{ stmt; stmt; stmt; }` (3 statements) -- 3% chance: 4+ statements - -**Column list generation:** -```bash -./fuzzer --start-rule selectList --quantifier-strategy uniform --min-repeat 3 --max-repeat 7 -``` -**Generated:** -- Equal probability: `col1, col2, col3` to `col1, col2, col3, col4, col5, col6, col7` - -**Performance testing with large lists:** -```bash -./fuzzer --start-rule selectStmt --quantifier-count 100 --count 5 -``` -**Generated:** -- Always generates exactly 100 columns to test parser performance on large SELECT lists - -**Simple user control:** -```bash -./fuzzer 
--start-rule blockStmt --max-quantifier 3 --count 10 -``` -**Generated:** -- `stmt*` generates 0-3 statements -- `expr+` generates 1-3 expressions -- User controls maximum without complex strategy options - -```bash -./fuzzer --start-rule selectStmt --optional-prob 0.7 --count 100 -# 70% chance to include optional whereClause -``` - -## Simple Architecture - -``` -tools/fuzzing/ -├── main.go # CLI entry point -├── generator.go # Core generation logic -└── grammar_parser.go # Reuse tools/grammar/ -``` - -## Core Logic - -```go -type Generator struct { - grammar *ParsedGrammar - maxDepth int - optionalProb float64 - random *rand.Rand -} - -func (g *Generator) GenerateFromRule(ruleName string, currentDepth int) string { - if currentDepth > g.maxDepth { - return g.generateTerminal() // Stop recursion - } - - rule := g.grammar.GetRule(ruleName) - alternative := g.selectAlternative(rule) - - result := "" - for _, element := range alternative { - if element.IsOptional() && g.random.Float64() > g.optionalProb { - continue // Skip optional element - } - if element.IsRule() { - result += g.GenerateFromRule(element.Name, currentDepth+1) - } else { - result += element.Literal - } - } - return result -} -``` - -## CLI Interface - -```bash -# Basic usage - generate from specific rule -./fuzzer --grammar postgresql --start-rule selectStmt --count 10 - -# Control recursion depth -./fuzzer --grammar cql --start-rule expr --max-depth 3 --count 5 - -# Control optional probability -./fuzzer --grammar postgresql --start-rule createStmt --optional-prob 0.8 --count 10 - -# Control quantifier max count (for rule*, rule+) -./fuzzer --grammar postgresql --start-rule blockStmt --max-quantifier 8 --count 20 - -# Control all parameters together -./fuzzer --grammar cql --start-rule selectStmt \ - --max-depth 5 \ - --optional-prob 0.7 \ - --max-quantifier 10 \ - --count 50 - -# Output to file -./fuzzer --grammar postgresql --start-rule selectStmt --count 100 --output queries.sql -``` - -## 
Implementation Steps - -### Step 1: Basic Generator -- Parse grammar using existing `tools/grammar/` -- Simple rule expansion with depth limit -- CLI with `--start-rule`, `--max-depth`, `--count` - -### Step 2: Optional Control -- Add `--optional-prob` flag -- Detect optional elements in grammar rules -- Apply probability during generation - -### Step 3: Integration -- Test generated queries against parsers -- Add basic performance timing -- CI integration for regression testing - -## Common Fuzzing Techniques Used - -1. **Grammar-based generation** - Generate from formal grammar rules -2. **Depth limiting** - Prevent infinite recursion in recursive grammars -3. **Probability-based selection** - Control optional rule inclusion -4. **Targeted fuzzing** - Focus on specific rule paths instead of full grammar - -This approach is much simpler but addresses your specific needs for testing parser performance on particular constructs. \ No newline at end of file diff --git a/tools/fuzzing/Makefile b/tools/fuzzing/Makefile index 20503f2..227ca53 100644 --- a/tools/fuzzing/Makefile +++ b/tools/fuzzing/Makefile @@ -1,16 +1,6 @@ -BINARY_NAME=fuzzer -BUILD_DIR=bin -CMD_PATH=github.com/bytebase/parser/tools/fuzzing/cmd/fuzzer +.PHONY: all test clean help -.PHONY: all build test clean run help - -all: build test - -# Build the binary -build: - @echo "Building $(BINARY_NAME)..." - @mkdir -p $(BUILD_DIR) - go build -o $(BUILD_DIR)/$(BINARY_NAME) $(CMD_PATH) +all: test # Run tests test: @@ -20,13 +10,8 @@ test: # Clean build artifacts clean: @echo "Cleaning..." - rm -rf $(BUILD_DIR) go clean -# Run the fuzzer (requires arguments) -run: - go run $(CMD_PATH) $(ARGS) - # Install dependencies deps: @echo "Installing dependencies..." 
@@ -45,15 +30,9 @@ lint: # Show help help: @echo "Available targets:" - @echo " build - Build the fuzzer binary" @echo " test - Run all tests" @echo " clean - Clean build artifacts" - @echo " run - Run the fuzzer (use ARGS='--grammar postgresql --start-rule selectStmt')" @echo " deps - Install/update dependencies" @echo " fmt - Format all Go code" @echo " lint - Run golangci-lint" - @echo " help - Show this help message" - @echo "" - @echo "Examples:" - @echo " make run ARGS='--grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 5'" - @echo " make run ARGS='--help'" \ No newline at end of file + @echo " help - Show this help message" \ No newline at end of file diff --git a/tools/fuzzing/README.md b/tools/fuzzing/README.md deleted file mode 100644 index f496ff1..0000000 --- a/tools/fuzzing/README.md +++ /dev/null @@ -1,124 +0,0 @@ -# Grammar-Aware Fuzzing Tool - -A fuzzing tool that generates valid SQL inputs from ANTLR v4 grammar files for parser testing. 
- -## Quick Start - -```bash -# Build the fuzzer -make build - -# List available grammars -./bin/fuzzer --list-grammars - -# Single combined grammar file -./bin/fuzzer --grammar combined.g4 --start-rule selectStmt --count 10 - -# Separate lexer and parser files -./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 10 - -# Run with custom parameters -./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 3 --max-quantifier 8 --count 5 -``` - -## Project Structure - -``` -tools/fuzzing/ -├── cmd/fuzzer/ # CLI application entry point -│ └── main.go -├── internal/ # Private application packages -│ ├── config/ # Configuration management -│ └── generator/ # Core fuzzing logic -├── bin/ # Built binaries (created by make build) -├── Makefile # Build and development tasks -└── go.mod # Go module definition -``` - -## CLI Options - -| Flag | Description | Default | -|------|-------------|---------| -| `--grammar` | Grammar file(s): single file or comma-separated lexer,parser | - | -| `--start-rule` | Starting grammar rule (required) | - | -| `--count` | Number of queries to generate | 10 | -| `--max-depth` | Maximum recursion depth | 5 | -| `--optional-prob` | Probability of optional elements (0.0-1.0) | 0.5 | -| `--max-quantifier` | Maximum count for `*` and `+` quantifiers | 5 | -| `--min-quantifier` | Minimum count override | 0 | -| `--quantifier-count` | Fixed count for all quantifiers | 0 | -| `--output` | Output file path | stdout | -| `--seed` | Random seed for reproducible results | current time | - -## Examples - -### Basic Usage -```bash -# Single combined grammar file -./bin/fuzzer --grammar combined.g4 --start-rule selectStmt --count 10 - -# Separate lexer and parser files -./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 10 - -# Generate CQL expressions with limited depth -./bin/fuzzer --grammar 
cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 3 --count 5 -``` - -### Performance Testing -```bash -# Generate queries with exactly 100 columns -./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --quantifier-count 100 --count 5 - -# Generate deeply nested expressions -./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 15 --count 10 -``` - -### Output Control -```bash -# Save to file -./bin/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 100 --output queries.sql - -# Reproducible generation -./bin/fuzzer --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --seed 42 --count 10 -``` - -## Development - -### Build Commands -```bash -# From tools/fuzzing directory -make build # Build binary to bin/fuzzer -make test # Run all tests -make clean # Clean build artifacts -make fmt # Format code -make deps # Install/update dependencies (runs from repo root) - -# From repository root -go build -o tools/fuzzing/bin/fuzzer github.com/bytebase/parser/tools/fuzzing/cmd/fuzzer -``` - -### Running During Development -```bash -# From tools/fuzzing directory -make run ARGS='--grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 5' -make run ARGS='--help' - -# From repository root -go run github.com/bytebase/parser/tools/fuzzing/cmd/fuzzer --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 5 -``` - -## Monolithic Repository Structure - -This tool uses the single `go.mod` file at the repository root: -- **Module**: `github.com/bytebase/parser` -- **Import path**: `github.com/bytebase/parser/tools/fuzzing/...` -- **Dependencies**: Shared with other tools in the repository - -## Integration - -This tool is designed to integrate with: -- Existing ANTLR v4 grammar parser at `tools/grammar/` -- All parser 
implementations in the repository (postgresql, cql, redshift, etc.) -- Shared CI/CD pipeline and testing infrastructure - -**TODO**: Grammar parser integration and actual query generation logic. \ No newline at end of file diff --git a/tools/fuzzing/cmd/fuzzer/main.go b/tools/fuzzing/cmd/fuzzer/main.go deleted file mode 100644 index c388520..0000000 --- a/tools/fuzzing/cmd/fuzzer/main.go +++ /dev/null @@ -1,81 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "os" - "strings" - "time" - - "github.com/bytebase/parser/tools/fuzzing/internal/config" - "github.com/bytebase/parser/tools/fuzzing/internal/generator" -) - -func main() { - cfg := parseFlags() - - if err := cfg.Validate(); err != nil { - fmt.Fprintf(os.Stderr, "Error: %v\n", err) - os.Exit(1) - } - - cfg.Print() - - gen := generator.New(cfg) - if err := gen.Generate(); err != nil { - fmt.Fprintf(os.Stderr, "Generation failed: %v\n", err) - os.Exit(1) - } - - fmt.Println("Generation completed successfully!") -} - -func parseFlags() *config.Config { - cfg := &config.Config{} - var grammarArg string - - flag.StringVar(&grammarArg, "grammar", "", "Grammar file(s): single file or comma-separated lexer,parser files") - flag.StringVar(&cfg.StartRule, "start-rule", "", "Starting grammar rule name") - flag.IntVar(&cfg.Count, "count", 10, "Number of queries to generate") - flag.IntVar(&cfg.MaxDepth, "max-depth", 5, "Maximum recursion depth") - flag.Float64Var(&cfg.OptionalProb, "optional-prob", 0.5, "Probability of including optional elements (0.0-1.0)") - flag.IntVar(&cfg.MaxQuantifier, "max-quantifier", 5, "Maximum count for quantified rules (* and +)") - flag.IntVar(&cfg.MinQuantifier, "min-quantifier", 0, "Minimum count for quantified rules (overrides grammar)") - flag.IntVar(&cfg.QuantifierCount, "quantifier-count", 0, "Fixed count for all quantifiers (overrides min/max)") - flag.StringVar(&cfg.Output, "output", "", "Output file path (default: stdout)") - flag.Int64Var(&cfg.Seed, "seed", time.Now().UnixNano(), 
"Random seed for reproducible generation") - - // Custom usage message - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Grammar-Aware Fuzzing Tool\n\n") - fmt.Fprintf(os.Stderr, "Usage: %s [options]\n\n", os.Args[0]) - fmt.Fprintf(os.Stderr, "Options:\n") - flag.PrintDefaults() - - fmt.Fprintf(os.Stderr, "\nExamples:\n") - fmt.Fprintf(os.Stderr, " # Single combined grammar file\n") - fmt.Fprintf(os.Stderr, " %s --grammar combined.g4 --start-rule selectStmt --count 10\n\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " # Separate lexer and parser files\n") - fmt.Fprintf(os.Stderr, " %s --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 10\n\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " # Control recursion and quantifiers\n") - fmt.Fprintf(os.Stderr, " %s --grammar cql/CqlLexer.g4,cql/CqlParser.g4 --start-rule expr --max-depth 3 --max-quantifier 8 --count 5\n\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " # Performance testing\n") - fmt.Fprintf(os.Stderr, " %s --grammar redshift/RedshiftLexer.g4,redshift/RedshiftParser.g4 --start-rule blockStmt --quantifier-count 100 --count 10\n\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " # Output to file\n") - fmt.Fprintf(os.Stderr, " %s --grammar postgresql/PostgreSQLLexer.g4,postgresql/PostgreSQLParser.g4 --start-rule selectStmt --count 100 --output queries.sql\n\n", os.Args[0]) - } - - flag.Parse() - - // Parse grammar files from comma-separated argument - if grammarArg != "" { - files := strings.Split(grammarArg, ",") - // Trim whitespace from each file - for i, file := range files { - files[i] = strings.TrimSpace(file) - } - cfg.GrammarFiles = files - } - - return cfg -} diff --git a/tools/fuzzing/internal/config/config.go b/tools/fuzzing/internal/config/config.go index d976e37..3d4e27a 100644 --- a/tools/fuzzing/internal/config/config.go +++ b/tools/fuzzing/internal/config/config.go @@ -6,6 +6,28 @@ import ( "github.com/pkg/errors" ) +// OutputFormat represents different output formatting 
options +type OutputFormat int + +const ( + // CompactOutput shows cleaner, more readable output (default) + CompactOutput OutputFormat = iota + // VerboseOutput shows full grammar rule traversal with comments + VerboseOutput +) + +// ParseOutputFormat parses a string into an OutputFormat +func ParseOutputFormat(s string) OutputFormat { + switch s { + case "compact", "": + return CompactOutput + case "verbose": + return VerboseOutput + default: + return CompactOutput + } +} + // Config holds all configuration options for the fuzzer type Config struct { GrammarFiles []string // Can be one file (combined) or two files (lexer,parser) @@ -17,6 +39,7 @@ type Config struct { MinQuantifier int QuantifierCount int Output string + OutputFormat OutputFormat // How to format the output Seed int64 } diff --git a/tools/fuzzing/internal/generator/generator.go b/tools/fuzzing/internal/generator/generator.go index a4b7840..9eb2ea6 100644 --- a/tools/fuzzing/internal/generator/generator.go +++ b/tools/fuzzing/internal/generator/generator.go @@ -3,6 +3,7 @@ package generator import ( "fmt" "math/rand" + "strings" "github.com/bytebase/parser/tools/fuzzing/internal/config" "github.com/bytebase/parser/tools/fuzzing/internal/grammar" @@ -16,11 +17,19 @@ type Generator struct { grammar *grammar.ParsedGrammar } +// WorkItem represents a unit of work in the generation stack +type WorkItem struct { + RuleName string + Depth int + Result *string // Pointer to where the result should be stored +} + // New creates a new generator with the given configuration func New(cfg *config.Config) *Generator { return &Generator{ - config: cfg, - random: rand.New(rand.NewSource(cfg.Seed)), + config: cfg, + random: rand.New(rand.NewSource(cfg.Seed)), + grammar: nil, } } @@ -58,18 +67,19 @@ func (g *Generator) getRule(ruleName string) *grammar.Rule { return g.grammar.GetRule(ruleName) } + // generateQuery creates a single query using grammar rules func (g *Generator) generateQuery(index int) string { - // 
Start generation from the specified start rule + // Start generation from the specified start rule with no recursion limit for now result := g.generateFromRule(g.config.StartRule, 0) return result } -// generateFromRule recursively generates text from a grammar rule +// generateFromRule generates text from a grammar rule func (g *Generator) generateFromRule(ruleName string, currentDepth int) string { // Check depth limit to prevent infinite recursion if currentDepth >= g.config.MaxDepth { - return g.generateTerminal(ruleName) + return fmt.Sprintf("<%s_MAX_DEPTH>", ruleName) } // Get the rule @@ -96,7 +106,18 @@ func (g *Generator) generateFromRule(ruleName string, currentDepth int) string { } } - return fmt.Sprintf("/* %s */ %s", ruleName, joinWithSpaces(result)) + // Format output based on configuration + switch g.config.OutputFormat { + case config.CompactOutput: + // Clean, readable output without verbose comments (default) + return joinWithSpaces(result) + case config.VerboseOutput: + // Full grammar rule traversal with comments + return fmt.Sprintf("/* %s */ %s", ruleName, joinWithSpaces(result)) + default: + // Default to compact + return joinWithSpaces(result) + } } // generateFromElement generates text from a single grammar element @@ -114,11 +135,11 @@ func (g *Generator) generateFromElement(element *grammar.Element, currentDepth i // Generate single element if element.IsRule() { if refValue, ok := element.Value.(grammar.ReferenceValue); ok { - return g.generateFromRule(refValue.Name, currentDepth+1) + return g.generateFromRuleOrToken(refValue.Name, currentDepth+1) } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { return g.generateFromBlock(blockValue, currentDepth) } - return g.generateFromRule(element.Value.String(), currentDepth+1) + return g.generateFromRuleOrToken(element.Value.String(), currentDepth+1) } else if element.IsTerminal() { if litValue, ok := element.Value.(grammar.LiteralValue); ok { return cleanLiteral(litValue.Text) 
@@ -151,10 +172,13 @@ func (g *Generator) generateQuantified(element *grammar.Element, currentDepth in for i := 0; i < count; i++ { if element.IsRule() { if refValue, ok := element.Value.(grammar.ReferenceValue); ok { - result := g.generateFromRule(refValue.Name, currentDepth+1) + result := g.generateFromRuleOrToken(refValue.Name, currentDepth+1) + results = append(results, result) + } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + result := g.generateFromBlock(blockValue, currentDepth+1) results = append(results, result) } else { - result := g.generateFromRule(element.Value.String(), currentDepth+1) + result := g.generateFromRuleOrToken(element.Value.String(), currentDepth+1) results = append(results, result) } } else if element.IsTerminal() { @@ -163,9 +187,6 @@ func (g *Generator) generateQuantified(element *grammar.Element, currentDepth in } else { results = append(results, cleanLiteral(element.Value.String())) } - } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { - result := g.generateFromBlock(blockValue, currentDepth+1) - results = append(results, result) } } @@ -194,12 +215,217 @@ func (g *Generator) generateFromBlock(blockValue grammar.BlockValue, currentDept return joinWithSpaces(result) } -// generateTerminal generates a terminal when depth limit is reached -func (g *Generator) generateTerminal(ruleName string) string { - // For depth-limited cases, return a simple placeholder - return fmt.Sprintf("<%s_TERM>", ruleName) + +// generateFromRuleOrToken generates from a rule using standard rule-based generation +func (g *Generator) generateFromRuleOrToken(ruleName string, currentDepth int) string { + // Check if this is a lexer rule and generate concrete token + if rule := g.grammar.GetRule(ruleName); rule != nil && rule.IsLexer { + return g.generateConcreteToken(ruleName) + } + + // Otherwise expand as parser rule + return g.generateFromRule(ruleName, currentDepth) +} + +// generateConcreteToken generates concrete 
tokens by expanding lexer rules +func (g *Generator) generateConcreteToken(ruleName string) string { + // Get the lexer rule + rule := g.grammar.GetRule(ruleName) + if rule == nil || !rule.IsLexer { + return fmt.Sprintf("<%s>", ruleName) + } + + // For lexer rules, we need to expand them but generate concrete characters + // at the terminal level (character sets, literals, etc.) + return g.generateFromLexerRule(rule, 0) } +// generateFromLexerRule generates content from a lexer rule +func (g *Generator) generateFromLexerRule(rule *grammar.Rule, currentDepth int) string { + if len(rule.Alternatives) == 0 { + return "" + } + + // Select a random alternative + altIndex := g.random.Intn(len(rule.Alternatives)) + alternative := rule.Alternatives[altIndex] + + // Generate from all elements in the alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromLexerElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + } + + return strings.Join(result, "") +} + +// generateFromLexerElement generates content from a lexer element +func (g *Generator) generateFromLexerElement(element *grammar.Element, currentDepth int) string { + // Handle optional elements + if element.IsOptional() && g.random.Float64() > g.config.OptionalProb { + return "" // Skip optional element + } + + // Handle quantified elements + if element.IsQuantified() { + return g.generateQuantifiedLexer(element, currentDepth) + } + + // Generate single element + if element.IsRule() { + if refValue, ok := element.Value.(grammar.ReferenceValue); ok { + // Check if referenced rule is lexer or parser + if referencedRule := g.grammar.GetRule(refValue.Name); referencedRule != nil && referencedRule.IsLexer { + return g.generateFromLexerRule(referencedRule, currentDepth+1) + } else { + // Parser rule - shouldn't happen in lexer context, but handle it + return g.generateFromRule(refValue.Name, currentDepth+1) + } + } 
else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + return g.generateFromLexerBlock(blockValue, currentDepth) + } + return element.Value.String() + } else if element.IsTerminal() { + if litValue, ok := element.Value.(grammar.LiteralValue); ok { + return g.generateFromLiteral(litValue.Text) + } + return g.generateFromLiteral(element.Value.String()) + } + + return element.Value.String() +} + +// generateQuantifiedLexer handles quantified lexer elements +func (g *Generator) generateQuantifiedLexer(element *grammar.Element, currentDepth int) string { + var count int + + // Use fixed count if specified, otherwise use random count + if g.config.QuantifierCount > 0 { + count = g.config.QuantifierCount + } else { + switch element.Quantifier { + case grammar.ZERO_MORE: // * + count = g.random.Intn(g.config.MaxQuantifier + 1) // 0 to MaxQuantifier + case grammar.ONE_MORE: // + + count = 1 + g.random.Intn(g.config.MaxQuantifier) // 1 to MaxQuantifier + default: + count = 1 + } + } + + var results []string + for i := 0; i < count; i++ { + result := g.generateFromLexerElement(&grammar.Element{ + Value: element.Value, + Quantifier: grammar.NONE, // Remove quantifier for individual generation + }, currentDepth+1) + if result != "" { + results = append(results, result) + } + } + + return strings.Join(results, "") +} + +// generateFromLexerBlock generates content from a lexer block +func (g *Generator) generateFromLexerBlock(blockValue grammar.BlockValue, currentDepth int) string { + if len(blockValue.Alternatives) == 0 { + return "" + } + + // Select a random alternative from the block + altIndex := g.random.Intn(len(blockValue.Alternatives)) + alternative := blockValue.Alternatives[altIndex] + + // Generate from all elements in the selected alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromLexerElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + 
} + + return strings.Join(result, "") +} + +// generateFromLiteral generates concrete characters from lexer literals and character sets +func (g *Generator) generateFromLiteral(literal string) string { + // Handle character sets like ~[\u0000"] or [a-zA-Z_] + if strings.HasPrefix(literal, "~[") && strings.HasSuffix(literal, "]") { + return g.generateFromNegatedSet(literal[2 : len(literal)-1]) + } else if strings.HasPrefix(literal, "[") && strings.HasSuffix(literal, "]") { + return g.generateFromCharacterSet(literal[1 : len(literal)-1]) + } + + // Handle string literals + if strings.HasPrefix(literal, "'") && strings.HasSuffix(literal, "'") && len(literal) >= 2 { + return literal[1 : len(literal)-1] // Remove quotes + } + + // Handle special escape sequences + switch literal { + case "\\r": + return "\r" + case "\\n": + return "\n" + case "\\t": + return "\t" + case "\\\"": + return "\"" + case "\\'": + return "'" + case "\\\\": + return "\\" + } + + // Return as-is for other cases + return literal +} + +// generateFromCharacterSet generates a random character from a character set like [a-zA-Z_] +func (g *Generator) generateFromCharacterSet(charset string) string { + chars := []rune{} + + // Simple character set expansion - handle ranges like a-z, A-Z, 0-9 + i := 0 + for i < len(charset) { + if i+2 < len(charset) && charset[i+1] == '-' { + // Handle range like a-z + start := rune(charset[i]) + end := rune(charset[i+2]) + for r := start; r <= end; r++ { + chars = append(chars, r) + } + i += 3 + } else { + // Single character + chars = append(chars, rune(charset[i])) + i++ + } + } + + if len(chars) == 0 { + return "x" // Fallback + } + + return string(chars[g.random.Intn(len(chars))]) +} + +// generateFromNegatedSet generates a character NOT in the specified set +func (g *Generator) generateFromNegatedSet(negatedSet string) string { + // For simplicity, generate common safe characters that are typically not in negated sets + safeChars := []string{"a", "b", "c", "x", 
"y", "z", "_", "1", "2", "3"} + + // TODO: Implement proper negated set handling by expanding the set and excluding those characters + // For now, just return a safe character + return safeChars[g.random.Intn(len(safeChars))] +} + + // cleanLiteral removes quotes from literal strings func cleanLiteral(literal string) string { // Remove single quotes from literals like 'SELECT' diff --git a/tools/fuzzing/tests/postgresql_test.go b/tools/fuzzing/tests/postgresql_test.go new file mode 100644 index 0000000..fa067a4 --- /dev/null +++ b/tools/fuzzing/tests/postgresql_test.go @@ -0,0 +1,192 @@ +package tests + +import ( + "fmt" + "path/filepath" + "runtime" + "testing" + + "github.com/bytebase/parser/tools/fuzzing/internal/config" + "github.com/bytebase/parser/tools/fuzzing/internal/generator" +) + +// getRepoRoot finds the repository root directory +func getRepoRoot() string { + _, filename, _, _ := runtime.Caller(0) + // Go up from tools/fuzzing/tests to the repo root + return filepath.Join(filepath.Dir(filename), "..", "..", "..") +} + +func TestPostgreSQLSelectStmt(t *testing.T) { + repoRoot := getRepoRoot() + + // PostgreSQL grammar file paths + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + tests := []struct { + name string + startRule string + count int + maxDepth int + optionalProb float64 + seed int64 + }{ + { + name: "Simple SELECT statements", + startRule: "selectstmt", + count: 3, + maxDepth: 5, + optionalProb: 0.7, + seed: 42, + }, + { + name: "Deep SELECT statements", + startRule: "selectstmt", + count: 2, + maxDepth: 8, + optionalProb: 0.5, + seed: 123, + }, + { + name: "Minimal SELECT statements", + startRule: "selectstmt", + count: 5, + maxDepth: 3, + optionalProb: 0.3, + seed: 456, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: 
tt.startRule, + Count: tt.count, + MaxDepth: tt.maxDepth, + OptionalProb: tt.optionalProb, + MaxQuantifier: 3, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.CompactOutput, + Seed: tt.seed, + } + + fmt.Printf("\n=== %s ===\n", tt.name) + fmt.Printf("Config: MaxDepth=%d, OptionalProb=%.1f, Count=%d, Seed=%d\n", + tt.maxDepth, tt.optionalProb, tt.count, tt.seed) + fmt.Println() + + gen := generator.New(cfg) + err := gen.Generate() + + if err != nil { + t.Errorf("Failed to generate PostgreSQL %s: %v", tt.startRule, err) + } else { + t.Logf("Successfully generated %d PostgreSQL %s statements", tt.count, tt.startRule) + } + }) + } +} + +func TestPostgreSQLExpressions(t *testing.T) { + repoRoot := getRepoRoot() + + // PostgreSQL grammar file paths + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: "a_expr", // PostgreSQL expression rule + Count: 5, + MaxDepth: 4, + OptionalProb: 0.6, + MaxQuantifier: 2, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.CompactOutput, + Seed: 789, + } + + fmt.Printf("\n=== PostgreSQL Expressions ===\n") + fmt.Printf("Generating %d expressions with max depth %d\n", cfg.Count, cfg.MaxDepth) + fmt.Println() + + gen := generator.New(cfg) + err := gen.Generate() + + if err != nil { + t.Errorf("Failed to generate PostgreSQL expressions: %v", err) + } else { + t.Logf("Successfully generated %d PostgreSQL expressions", cfg.Count) + } +} + +func TestPostgreSQLVerboseOutput(t *testing.T) { + repoRoot := getRepoRoot() + + // PostgreSQL grammar file paths + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: "selectstmt", + Count: 2, + MaxDepth: 4, 
+ OptionalProb: 0.8, + MaxQuantifier: 2, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.VerboseOutput, // Show rule traversal + Seed: 999, + } + + fmt.Printf("\n=== PostgreSQL Verbose Output ===\n") + fmt.Printf("Generating with verbose output to show rule traversal\n") + fmt.Println() + + gen := generator.New(cfg) + err := gen.Generate() + + if err != nil { + t.Errorf("Failed to generate PostgreSQL statements with verbose output: %v", err) + } else { + t.Logf("Successfully generated PostgreSQL statements with verbose output") + } +} + +// Benchmark test for performance measurement +func BenchmarkPostgreSQLGeneration(b *testing.B) { + repoRoot := getRepoRoot() + + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: "selectstmt", + Count: 1, + MaxDepth: 6, + OptionalProb: 0.5, + MaxQuantifier: 3, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.CompactOutput, + Seed: 42, + } + + gen := generator.New(cfg) + + // Reset the timer to exclude setup time + b.ResetTimer() + + for i := 0; i < b.N; i++ { + err := gen.Generate() + if err != nil { + b.Fatalf("Generation failed: %v", err) + } + } +} \ No newline at end of file diff --git a/tools/grammar/README.md b/tools/grammar/README.md deleted file mode 100644 index 2641250..0000000 --- a/tools/grammar/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# ANTLR v4 Grammar Parser - -A Go implementation to parse ANTLR v4 grammar files (`.g4` files) in this repository. - -## Source - -The lexer and parser grammars come from: https://github.com/antlr/grammars-v4/blob/master/antlr/antlr4 - -## Why Custom NextToken()? 
- -We added `func (l *LexerAdaptor) NextToken() antlr.Token` in `lexer_adaptor.go` because: - -- ANTLR grammar parsing requires context-sensitive lexing -- Need to convert `ID` tokens to `TOKEN_REF` (uppercase) or `RULE_REF` (lowercase) -- Go ANTLR doesn't automatically call `Emit()` like Java ANTLR does -- Go tokens are immutable, so we use a `TokenTypeWrapper` to override token types - -## Why Sed Command in Makefile? - -We added this sed command in the Makefile: -```bash -sed -i '' 's/l\.BaseLexer = antlr\.NewBaseLexer(input)/l.LexerAdaptor = *NewLexerAdaptor(input)/' antlrv4_lexer.go -``` - -Because: -- ANTLR code generation creates `l.BaseLexer = antlr.NewBaseLexer(input)` -- We need `l.LexerAdaptor = *NewLexerAdaptor(input)` to use our custom lexer -- This automatically fixes the generated constructor after each regeneration - -## Usage - -```bash -make build # Generate parser and apply fixes -make test # Test all .g4 files in repository (should show 100% success) -make all # Build and test -``` \ No newline at end of file From 029ad0c8e69a5acace5cd78cb38ce797e2f1d51f Mon Sep 17 00:00:00 2001 From: h3n4l Date: Fri, 29 Aug 2025 14:34:21 +0800 Subject: [PATCH 9/9] chore: go mod tidy --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 9e2614c..8cefc8a 100644 --- a/go.mod +++ b/go.mod @@ -4,12 +4,12 @@ go 1.24.5 require ( github.com/antlr4-go/antlr/v4 v4.13.1 + github.com/pkg/errors v0.9.1 github.com/stretchr/testify v1.10.0 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect