Skip to content

Commit 49d988b

Browse files
committed
Implement admin web scraping interface
1 parent 93fbe86 commit 49d988b

7 files changed

Lines changed: 12351 additions & 12398 deletions

File tree

backend/src/app.ts

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ import cors from 'cors';
33
import Fuse from 'fuse.js';
44
import morgan from 'morgan';
55
import { randomUUID } from 'crypto';
6+
import * as fs from 'fs';
7+
import * as path from 'path';
68
import {
79
Review,
810
Landlord,
@@ -26,6 +28,7 @@ import { auth } from 'firebase-admin';
2628
import { Timestamp } from '@firebase/firestore-types';
2729
import nodemailer from 'nodemailer';
2830
import axios from 'axios';
31+
import { runScrapers } from './scrapers';
2932
import { db, FieldValue, FieldPath } from './firebase-config';
3033
import { Faq } from './firebase-config/types';
3134
import authenticate from './auth';
@@ -1888,6 +1891,250 @@ app.post('/api/admin/migrate-all-apartments-schema', authenticate, async (req, r
18881891
}
18891892
});
18901893

1894+
// Web scraper and diffing endpoints
1895+
/**
 * Normalizes an address string so two spellings of the same street address
 * compare equal during fuzzy matching.
 *
 * Steps, in order:
 *  1. lowercase
 *  2. remove ZIP codes, including the ZIP+4 form ("14850-1234")
 *  3. replace common address punctuation (, . ; : #) with a space
 *  4. remove the standalone words "ithaca" and "ny"
 *  5. collapse runs of whitespace and trim
 *
 * @param addr - Raw address text (e.g. "123 Main St., Ithaca, NY 14850").
 * @returns The normalized address; an empty string when nothing is left.
 */
function normalizeAddress(addr: string): string {
  return (
    addr
      .toLowerCase()
      // ZIP is removed before punctuation so the "+4" suffix goes with it
      // instead of being left behind as a stray "-1234" token.
      .replace(/\b\d{5}(?:-\d{4})?\b/g, ' ')
      // "#" is included so "apt #2" and "apt 2" normalize identically.
      .replace(/[,.;:#]/g, ' ')
      .replace(/\bithaca\b/g, ' ')
      .replace(/\bny\b/g, ' ')
      .replace(/\s+/g, ' ')
      .trim()
  );
}
1909+
1910+
/**
 * Converts a value into a single CSV field.
 *
 * `null` and `undefined` become an empty field; every other value is
 * converted with `String()`. A field containing a comma, a double quote, or
 * a line break (LF or CR) is wrapped in double quotes, with embedded double
 * quotes doubled.
 *
 * NOTE(review): values are not neutralized against spreadsheet formula
 * injection (leading "=", "+", "-", "@"). Confirm whether that matters for
 * how the resulting CSV is opened.
 *
 * @param value - Any value to serialize.
 * @returns The escaped CSV field text.
 */
function escapeCSVField(value: unknown): string {
  const text = value === null || value === undefined ? '' : String(value);
  // "\r" is checked as well as "\n": a bare carriage return is also a line
  // break to CSV readers and would otherwise split the row.
  if (/[",\r\n]/.test(text)) {
    return `"${text.replace(/"/g, '""')}"`;
  }
  return text;
}
1917+
1918+
/**
 * Run Web Scraper + Diff - Triggers the requested agency scrapers, compares
 * the results against the current Firestore buildings, and writes a diff CSV
 * to ../scripts/scraper_diff.csv (relative to this module's directory).
 *
 * Each scraped property is matched to a building by normalized address: an
 * exact match is preferred, otherwise one address must contain the other on
 * whole-word boundaries. Buildings without a usable address are never matched.
 *
 * @route POST /api/admin/run-scraper
 *
 * @input {string[]} [req.body.agencies] - Optional list of agency keys to run.
 *   Omit or pass "all" to run all registered scrapers.
 *
 * @status
 * - 200: Scrape + diff complete; returns summary and marks csvReady: true
 * - 400: req.body.agencies is neither "all" nor an array of strings
 * - 401: Authentication failed
 * - 403: Unauthorized - Admin access required
 * - 500: Server error during scraping or diffing
 */
app.post('/api/admin/run-scraper', authenticate, async (req, res) => {
  // Respond instead of throwing: a throw here happens outside the try block
  // and would surface as an unhandled promise rejection in an async handler.
  if (!req.user) {
    res.status(401).send('Not authenticated');
    return;
  }

  const { email } = req.user;
  if (!email || !admins.includes(email)) {
    res.status(403).send('Unauthorized: Admin access required');
    return;
  }

  // req.body may be undefined when the request carries no parsed body.
  const agencies = req.body?.agencies ?? 'all';
  const agenciesValid =
    agencies === 'all' ||
    (Array.isArray(agencies) && agencies.every((key: unknown) => typeof key === 'string'));
  if (!agenciesValid) {
    res.status(400).send('Invalid "agencies": expected "all" or an array of agency keys');
    return;
  }

  try {
    console.log('[run-scraper] Starting scrapers...');
    const { results: scraped, errors: scraperErrors } = await runScrapers({ agencies });
    console.log(`[run-scraper] Scraped ${scraped.length} properties.`);

    // Fetch current buildings from db
    const snapshot = await buildingsCollection.get();
    const dbBuildings = snapshot.docs.map((doc) => ({
      id: doc.id,
      ...doc.data(),
    })) as Array<{
      id: string;
      name?: string;
      address?: string;
      numBeds?: number;
      numBaths?: number;
      price?: number;
    }>;

    // Buildings whose address normalizes to '' are dropped from the index:
    // every string "includes" the empty string, so such a building would
    // otherwise match every scraped property.
    const dbIndex = dbBuildings
      .map((b) => ({
        ...b,
        normalized: normalizeAddress(typeof b.address === 'string' ? b.address : ''),
      }))
      .filter((b) => b.normalized !== '');

    // Renders null/undefined as an empty cell instead of "null"/"undefined".
    const toText = (value: unknown): string =>
      value === null || value === undefined ? '' : String(value);

    // Whole-word containment, so "12 main st" does not match "112 main st".
    const containsPhrase = (haystack: string, needle: string): boolean =>
      ` ${haystack} `.includes(` ${needle} `);

    // Exact normalized match wins; otherwise fall back to containment in
    // either direction. An empty scraped address never matches anything.
    const findMatch = (normScraped: string) => {
      if (normScraped === '') return undefined;
      return (
        dbIndex.find((b) => b.normalized === normScraped) ??
        dbIndex.find(
          (b) =>
            containsPhrase(b.normalized, normScraped) || containsPhrase(normScraped, b.normalized)
        )
      );
    };

    type DiffRow = {
      status: 'NEW' | 'CHANGED' | 'UNCHANGED';
      firestoreId: string;
      dbName: string;
      scrapedAddress: string;
      numBedsScraped: string;
      numBedsDb: string;
      numBathsScraped: string;
      numBathsDb: string;
      priceScraped: string;
      priceDb: string;
      sourceUrl: string;
      agency: string;
    };

    const diffRows: DiffRow[] = [];

    scraped.forEach((prop) => {
      const match = findMatch(normalizeAddress(prop.address));

      const numBedsScraped = toText(prop.numBeds);
      const numBathsScraped = toText(prop.numBaths);
      const priceScraped = toText(prop.price);

      if (!match) {
        diffRows.push({
          status: 'NEW',
          firestoreId: '',
          dbName: '',
          scrapedAddress: prop.address,
          numBedsScraped,
          numBedsDb: '',
          numBathsScraped,
          numBathsDb: '',
          priceScraped,
          priceDb: '',
          sourceUrl: prop.sourceUrl,
          agency: prop.agency,
        });
        return;
      }

      const numBedsDb = toText(match.numBeds);
      const numBathsDb = toText(match.numBaths);
      const priceDb = toText(match.price);

      // A field the scraper could not read (empty) never counts as a change.
      const changed =
        (numBedsScraped !== '' && numBedsScraped !== numBedsDb) ||
        (numBathsScraped !== '' && numBathsScraped !== numBathsDb) ||
        (priceScraped !== '' && priceScraped !== priceDb);

      diffRows.push({
        status: changed ? 'CHANGED' : 'UNCHANGED',
        firestoreId: match.id,
        dbName: match.name ?? '',
        scrapedAddress: prop.address,
        numBedsScraped,
        numBedsDb,
        numBathsScraped,
        numBathsDb,
        priceScraped,
        priceDb,
        sourceUrl: prop.sourceUrl,
        agency: prop.agency,
      });
    });

    const countByStatus = (status: DiffRow['status']): number =>
      diffRows.filter((row) => row.status === status).length;
    const newCount = countByStatus('NEW');
    const changedCount = countByStatus('CHANGED');
    const unchangedCount = countByStatus('UNCHANGED');

    // Write diff CSV. The header list doubles as the column order for rows,
    // so header and data cannot drift apart.
    const CSV_HEADERS: Array<keyof DiffRow> = [
      'status',
      'firestoreId',
      'dbName',
      'scrapedAddress',
      'numBedsScraped',
      'numBedsDb',
      'numBathsScraped',
      'numBathsDb',
      'priceScraped',
      'priceDb',
      'sourceUrl',
      'agency',
    ];

    const csvLines = [
      CSV_HEADERS.join(','),
      ...diffRows.map((row) =>
        CSV_HEADERS.map((column) => escapeCSVField(row[column])).join(',')
      ),
    ];

    const csvPath = path.join(__dirname, '../scripts/scraper_diff.csv');
    // Async I/O keeps the event loop free; mkdir covers a missing target
    // directory (e.g. when running from a build output folder).
    await fs.promises.mkdir(path.dirname(csvPath), { recursive: true });
    await fs.promises.writeFile(csvPath, csvLines.join('\n'), 'utf8');
    console.log(`[run-scraper] Diff CSV written to ${csvPath}`);

    res.status(200).json({
      total: scraped.length,
      newCount,
      changedCount,
      unchangedCount,
      scraperErrors,
      csvReady: true,
    });
  } catch (err) {
    console.error('[run-scraper] Error:', err);
    res.status(500).send(`Scraper error: ${err instanceof Error ? err.message : 'Unknown error'}`);
  }
});
2103+
2104+
/**
 * Download Scraper Diff CSV - Streams ../scripts/scraper_diff.csv (relative
 * to this module's directory) as a file download. That file is the one
 * written by POST /api/admin/run-scraper.
 *
 * NOTE(review): the file lives on the local disk of the process that ran the
 * scraper — confirm this is acceptable for the deployment (multiple
 * instances or ephemeral filesystems would not share it).
 *
 * @route GET /api/admin/scraper-results.csv
 *
 * @status
 * - 200: CSV file download
 * - 401: Authentication failed
 * - 403: Unauthorized - Admin access required
 * - 404: No scraper results found — run the scraper first
 * - 500: Server error
 */
app.get('/api/admin/scraper-results.csv', authenticate, async (req, res) => {
  // Respond instead of throwing: a throw in an async handler would surface
  // as an unhandled promise rejection rather than an HTTP response.
  if (!req.user) {
    res.status(401).send('Not authenticated');
    return;
  }

  const { email } = req.user;
  if (!email || !admins.includes(email)) {
    res.status(403).send('Unauthorized: Admin access required');
    return;
  }

  const csvPath = path.join(__dirname, '../scripts/scraper_diff.csv');
  const csvStream = fs.createReadStream(csvPath);

  // Headers are sent only once the file has actually been opened, so a
  // missing file can still be answered with a clean 404.
  csvStream.once('open', () => {
    res.setHeader('Content-Type', 'text/csv');
    res.setHeader('Content-Disposition', 'attachment; filename="scraper_diff.csv"');
    csvStream.pipe(res);
  });

  // Without this listener a read failure would be an unhandled 'error'
  // event on the stream.
  csvStream.once('error', (err: NodeJS.ErrnoException) => {
    if (res.headersSent) {
      // Part of the file is already on the wire; abort the response.
      res.destroy(err);
      return;
    }
    if (err.code === 'ENOENT') {
      res.status(404).send('No scraper results found. Run POST /api/admin/run-scraper first.');
      return;
    }
    console.error('[scraper-results] Error reading CSV:', err);
    res.status(500).send('Error reading scraper results');
  });
});
2137+
18912138
/**
18922139
* Update Pending Building Status - Updates the status of a pending building report.
18932140
*

frontend/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
"react-router-dom": "^5.2.0",
3030
"react-scripts": "4.0.0",
3131
"sass": "^1.45.0",
32-
"typescript": "~4.0.5",
32+
"typescript": "4.9.5",
3333
"web-vitals": "^0.2.4"
3434
},
3535
"scripts": {

frontend/src/components/LeaveReview/ReviewModal.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ const ReviewModal = ({
288288
return;
289289
}
290290

291-
const newFiles = [...files].slice(0, availablePhotos);
291+
const newFiles = Array.from(files).slice(0, availablePhotos);
292292
const bigPhoto = newFiles.find(
293293
(newFiles) => newFiles.size > REVIEW_PHOTO_MAX_MB * Math.pow(1024, 2)
294294
);

frontend/src/components/utils/Footer/ContactModal.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ const ContactModal = ({ user }: Props) => {
292292
return;
293293
}
294294

295-
const newFiles = [...files].slice(0, availablePhotos);
295+
const newFiles = Array.from(files).slice(0, availablePhotos);
296296
const bigPhoto = newFiles.find((newFiles) => newFiles.size > PHOTO_MAX_MB * Math.pow(1024, 2));
297297
if (bigPhoto) {
298298
console.log(`File ${bigPhoto.name} exceeds max size of ${PHOTO_MAX_MB}`);

0 commit comments

Comments
 (0)