|
#!/bin/bash
# Scrape the list of GitHub Copilot supported models from the public docs
# page.  Produces: scraped-models.json (pretty JSON), models.txt (one model
# per line), page-content.html (debug copy of the scraped section).
#
# -e : abort on any command failure
# -u : treat use of unset variables as an error
# -o pipefail : a pipeline fails if ANY stage fails (plain `set -e` only
#   sees the LAST stage's status, so e.g. `node … | tee` would mask a
#   node crash behind tee's success).
set -euo pipefail

echo "Installing Puppeteer..."
npm install puppeteer

echo "Creating scraper script..."
# The quoted delimiter ('SCRAPE_EOF') disables shell expansion inside the
# heredoc so the embedded JavaScript's ${...} template literals survive.
cat > scrape.js << 'SCRAPE_EOF'
| 12 | +const puppeteer = require('puppeteer'); |
| 13 | +const fs = require('fs'); |
| 14 | +
|
| 15 | +(async () => { |
| 16 | + try { |
| 17 | + const browser = await puppeteer.launch({ |
| 18 | + headless: 'new', |
| 19 | + args: ['--no-sandbox', '--disable-setuid-sandbox'] |
| 20 | + }); |
| 21 | + const page = await browser.newPage(); |
| 22 | + |
| 23 | + console.error('Navigating to page...'); |
| 24 | + await page.goto('https://docs.github.com/en/copilot/reference/ai-models/supported-models', { |
| 25 | + waitUntil: 'networkidle0', |
| 26 | + timeout: 60000 |
| 27 | + }); |
| 28 | + |
| 29 | + console.error('Content loaded, extracting models...'); |
| 30 | + |
| 31 | + // Extract model names from the specific section |
| 32 | + const models = await page.evaluate(() => { |
| 33 | + const modelNames = []; |
| 34 | + |
| 35 | + // Find the "Supported AI models in Copilot" section |
| 36 | + const headings = Array.from(document.querySelectorAll('h2, h3')); |
| 37 | + const targetHeading = headings.find(h => h.textContent.includes('Supported AI models in Copilot')); |
| 38 | + |
| 39 | + if (!targetHeading) { |
| 40 | + console.error('ERROR: Could not find "Supported AI models in Copilot" heading'); |
| 41 | + return []; |
| 42 | + } |
| 43 | + |
| 44 | + console.error('Found target heading:', targetHeading.textContent); |
| 45 | + |
| 46 | + // Get the content section that contains this heading |
| 47 | + let contentSection = targetHeading.closest('div[class*="content"]') || targetHeading.parentElement; |
| 48 | + console.error('Content section found:', contentSection ? 'yes' : 'no'); |
| 49 | + |
| 50 | + // Find all tables within this section (or after the heading) |
| 51 | + let tables = []; |
| 52 | + let currentElement = targetHeading.nextElementSibling; |
| 53 | + |
| 54 | + // Traverse siblings until we hit another h2 or run out of elements |
| 55 | + while (currentElement) { |
| 56 | + if (currentElement.tagName === 'H2') { |
| 57 | + break; // Stop at the next major section |
| 58 | + } |
| 59 | + |
| 60 | + if (currentElement.tagName === 'TABLE') { |
| 61 | + tables.push(currentElement); |
| 62 | + } else if (currentElement.querySelectorAll) { |
| 63 | + // Check for tables within this element |
| 64 | + const nestedTables = currentElement.querySelectorAll('table'); |
| 65 | + tables.push(...nestedTables); |
| 66 | + } |
| 67 | + |
| 68 | + currentElement = currentElement.nextElementSibling; |
| 69 | + } |
| 70 | + |
| 71 | + console.error(`Found ${tables.length} tables in the target section`); |
| 72 | + |
| 73 | + tables.forEach((table, tableIndex) => { |
| 74 | + const rows = table.querySelectorAll('tbody tr'); |
| 75 | + console.error(`Table ${tableIndex}: Found ${rows.length} rows`); |
| 76 | + |
| 77 | + rows.forEach((row, rowIndex) => { |
| 78 | + // Look for the row header (th with scope="row") which contains the model name |
| 79 | + const rowHeader = row.querySelector('th[scope="row"]'); |
| 80 | + if (rowHeader) { |
| 81 | + let text = rowHeader.textContent.trim(); |
| 82 | + console.error(`Table ${tableIndex}, Row ${rowIndex}: "${text}"`); |
| 83 | + |
| 84 | + if (text && text.length > 0) { |
| 85 | + // Normalize model name: lowercase and replace spaces with dashes |
| 86 | + const normalizedName = text.toLowerCase().replace(/\s+/g, '-'); |
| 87 | + console.error(` Normalized: "${normalizedName}"`); |
| 88 | + modelNames.push(normalizedName); |
| 89 | + } |
| 90 | + } else { |
| 91 | + // Fallback to first td if no row header exists |
| 92 | + const cells = row.querySelectorAll('td'); |
| 93 | + if (cells.length > 0) { |
| 94 | + let text = cells[0].textContent.trim(); |
| 95 | + console.error(`Table ${tableIndex}, Row ${rowIndex} (fallback): "${text}"`); |
| 96 | + |
| 97 | + if (text && text.length > 0) { |
| 98 | + // Normalize model name: lowercase and replace spaces with dashes |
| 99 | + const normalizedName = text.toLowerCase().replace(/\s+/g, '-'); |
| 100 | + console.error(` Normalized: "${normalizedName}"`); |
| 101 | + modelNames.push(normalizedName); |
| 102 | + } |
| 103 | + } |
| 104 | + } |
| 105 | + }); |
| 106 | + }); |
| 107 | + |
| 108 | + // Remove duplicates |
| 109 | + return [...new Set(modelNames)]; |
| 110 | + }); |
| 111 | + |
| 112 | + // Save only the relevant section HTML for debugging |
| 113 | + const relevantHTML = await page.evaluate(() => { |
| 114 | + const headings = Array.from(document.querySelectorAll('h2, h3')); |
| 115 | + const targetHeading = headings.find(h => h.textContent.includes('Supported AI models in Copilot')); |
| 116 | + |
| 117 | + if (!targetHeading) { |
| 118 | + return '<p>Could not find target section</p>'; |
| 119 | + } |
| 120 | + |
| 121 | + let html = '<h2>' + targetHeading.textContent + '</h2>\n'; |
| 122 | + let currentElement = targetHeading.nextElementSibling; |
| 123 | + |
| 124 | + while (currentElement && currentElement.tagName !== 'H2') { |
| 125 | + html += currentElement.outerHTML + '\n'; |
| 126 | + currentElement = currentElement.nextElementSibling; |
| 127 | + } |
| 128 | + |
| 129 | + return html; |
| 130 | + }); |
| 131 | + |
| 132 | + fs.writeFileSync('page-content.html', relevantHTML); |
| 133 | + console.error('Saved relevant section HTML to page-content.html'); |
| 134 | + |
| 135 | + console.error(`Extracted ${models.length} unique models`); |
| 136 | + |
| 137 | + // Save models as JSON |
| 138 | + const modelsJson = JSON.stringify(models, null, 2); |
| 139 | + fs.writeFileSync('scraped-models.json', modelsJson); |
| 140 | + console.error('Saved scraped models to scraped-models.json'); |
| 141 | + |
| 142 | + // Output for the workflow |
| 143 | + console.log(JSON.stringify(models)); |
| 144 | + |
| 145 | + await browser.close(); |
| 146 | + } catch (error) { |
| 147 | + console.error('Error:', error.message); |
| 148 | + console.error('Stack trace:', error.stack); |
| 149 | + process.exit(1); |
| 150 | + } |
| 151 | +})(); |
SCRAPE_EOF

echo "Running scraper..."
# Keep stdout (the single JSON result line) separate from stderr (debug
# logging).  The original merged both with `2>&1 | tee` and then took
# `tail -n 1`, hoping the JSON landed last — interleaved/buffered stderr
# can break that, and tee's exit status masked node failures under `set -e`.
# Capturing stdout directly is robust and still fails the script if node
# exits non-zero; the debug log is preserved in scraper.log.
MODELS_JSON=$(node scrape.js 2> scraper.log)
echo "Scraped models JSON: $MODELS_JSON"

# Store the models, one per line
echo "$MODELS_JSON" | jq -r '.[]' > models.txt
echo "Models extracted to models.txt:"
cat models.txt

echo "Scraping complete!"
0 commit comments