@@ -3,6 +3,8 @@ import cors from 'cors';
33import Fuse from 'fuse.js' ;
44import morgan from 'morgan' ;
55import { randomUUID } from 'crypto' ;
6+ import * as fs from 'fs' ;
7+ import * as path from 'path' ;
68import {
79 Review ,
810 Landlord ,
@@ -26,6 +28,7 @@ import { auth } from 'firebase-admin';
2628import { Timestamp } from '@firebase/firestore-types' ;
2729import nodemailer from 'nodemailer' ;
2830import axios from 'axios' ;
31+ import { runScrapers } from './scrapers' ;
2932import { db , FieldValue , FieldPath } from './firebase-config' ;
3033import { Faq } from './firebase-config/types' ;
3134import authenticate from './auth' ;
@@ -1888,6 +1891,250 @@ app.post('/api/admin/migrate-all-apartments-schema', authenticate, async (req, r
18881891 }
18891892} ) ;
18901893
1894+ // Web scraper and diffing endpoints
/**
 * Normalizes an address string so two spellings of the same address can be
 * compared.
 *
 * Steps, applied in order: lowercase; replace commas and periods with spaces;
 * drop the standalone words "ithaca" and "ny"; drop 5-digit ZIP codes together
 * with an optional "-1234" ZIP+4 suffix; collapse whitespace runs to a single
 * space; trim. Other punctuation (e.g. "#" or "-") is left untouched.
 *
 * @param addr - Raw address text.
 * @returns The normalized address. May be the empty string when the input
 * contains nothing but the removed parts (e.g. "Ithaca, NY 14850").
 */
function normalizeAddress(addr: string): string {
  return (
    addr
      .toLowerCase()
      .replace(/[,.]/g, ' ')
      .replace(/\bithaca\b/g, '')
      .replace(/\bny\b/g, '')
      // Remove the ZIP+4 suffix with the ZIP itself; otherwise "14850-1234"
      // would leave a stray "-1234" behind and break matching.
      .replace(/\b\d{5}(?:-\d{4})?\b/g, '')
      .replace(/\s+/g, ' ')
      .trim()
  );
}
1909+
/**
 * Serializes a single value as one CSV field.
 *
 * `null` and `undefined` become the empty string; everything else is
 * converted with `String(value)`. The field is wrapped in double quotes, with
 * embedded quotes doubled, when it contains a comma, a double quote, or a
 * line break (`\n` or `\r`).
 *
 * @param value - The value to serialize.
 * @returns The field text, ready to be joined with commas.
 */
function escapeCSVField(value: unknown): string {
  const str = value === null || value === undefined ? '' : String(value);
  // A bare carriage return also terminates a record in many CSV readers, so
  // it has to force quoting just like "\n".
  if (/[",\r\n]/.test(str)) {
    return `"${str.replace(/"/g, '""')}"`;
  }
  return str;
}
1917+
/** One row of the scraper diff CSV; every cell is already stringified. */
type ScraperDiffRow = {
  status: 'NEW' | 'CHANGED' | 'UNCHANGED';
  firestoreId: string;
  dbName: string;
  scrapedAddress: string;
  numBedsScraped: string;
  numBedsDb: string;
  numBathsScraped: string;
  numBathsDb: string;
  priceScraped: string;
  priceDb: string;
  sourceUrl: string;
  agency: string;
};

/** Column order of the diff CSV; also drives row serialization. */
const SCRAPER_DIFF_COLUMNS: ReadonlyArray<keyof ScraperDiffRow> = [
  'status',
  'firestoreId',
  'dbName',
  'scrapedAddress',
  'numBedsScraped',
  'numBedsDb',
  'numBathsScraped',
  'numBathsDb',
  'priceScraped',
  'priceDb',
  'sourceUrl',
  'agency',
];

/** Renders a possibly-missing value as a diff cell; null/undefined become ''. */
function scraperDiffCell(value: unknown): string {
  return value === null || value === undefined ? '' : String(value);
}

/**
 * Decides whether two normalized addresses refer to the same place: both must
 * be non-empty, and one must contain the other starting at a token boundary.
 */
function scraperAddressesMatch(a: string, b: string): boolean {
  // ''.includes / includes('') is always true, so an empty side would match
  // every address; treat it as "no information" instead.
  if (a === '' || b === '') return false;
  if (a === b) return true;
  // Anchoring the start of the contained address on a space keeps
  // "10 main st" from matching inside "110 main st", while still allowing
  // "100 main st" to match "100 main street".
  const paddedA = ` ${a}`;
  const paddedB = ` ${b}`;
  return paddedA.includes(paddedB) || paddedB.includes(paddedA);
}

/**
 * Run Web Scraper + Diff - Triggers all registered agency scrapers, compares
 * the results against the current Firestore buildings, and writes a diff CSV.
 *
 * Each scraped property is matched to a building by normalized address (exact
 * match preferred, then containment). Unmatched properties are reported as
 * NEW; matched ones are CHANGED when a scraped beds/baths/price value is
 * present and differs from the stored one, otherwise UNCHANGED.
 *
 * @route POST /api/admin/run-scraper
 *
 * @input {string[]} [req.body.agencies] - Optional list of agency keys to run.
 *   Omit or pass "all" to run all registered scrapers.
 *
 * @status
 * - 200: Scrape + diff complete; returns summary and marks csvReady: true
 * - 401: Authentication failed
 * - 403: Unauthorized - Admin access required
 * - 500: Server error during scraping or diffing
 */
app.post('/api/admin/run-scraper', authenticate, async (req, res) => {
  // Respond explicitly: throwing here would reject the handler's promise
  // outside of any try/catch and never produce the documented 401.
  if (!req.user) {
    res.status(401).send('Not authenticated');
    return;
  }

  const { email } = req.user;
  if (!email || !admins.includes(email)) {
    res.status(403).send('Unauthorized: Admin access required');
    return;
  }

  try {
    // req.body is undefined when the request carries no parsed body.
    const agencies = req.body?.agencies ?? 'all';

    console.log('[run-scraper] Starting scrapers...');
    const { results: scraped, errors: scraperErrors } = await runScrapers({ agencies });
    console.log(`[run-scraper] Scraped ${scraped.length} properties.`);

    // Fetch current buildings from db and precompute their normalized address.
    const snapshot = await buildingsCollection.get();
    const dbIndex = snapshot.docs.map((doc) => {
      const building = { id: doc.id, ...doc.data() } as {
        id: string;
        name?: string;
        address?: string;
        numBeds?: number | null;
        numBaths?: number | null;
        price?: number | null;
      };
      return { ...building, normalized: normalizeAddress(building.address ?? '') };
    });

    const diffRows: ScraperDiffRow[] = [];
    let newCount = 0;
    let changedCount = 0;
    let unchangedCount = 0;

    for (const prop of scraped) {
      const scrapedAddress = prop.address ?? '';
      const normScraped = normalizeAddress(scrapedAddress);

      // Prefer an exact normalized match so a looser containment hit on an
      // earlier document cannot shadow the building that matches exactly.
      const match =
        normScraped === ''
          ? undefined
          : dbIndex.find((b) => b.normalized === normScraped) ??
            dbIndex.find((b) => scraperAddressesMatch(b.normalized, normScraped));

      const numBedsScraped = scraperDiffCell(prop.numBeds);
      const numBathsScraped = scraperDiffCell(prop.numBaths);
      const priceScraped = scraperDiffCell(prop.price);

      if (!match) {
        newCount += 1;
        diffRows.push({
          status: 'NEW',
          firestoreId: '',
          dbName: '',
          scrapedAddress,
          numBedsScraped,
          numBedsDb: '',
          numBathsScraped,
          numBathsDb: '',
          priceScraped,
          priceDb: '',
          sourceUrl: prop.sourceUrl,
          agency: prop.agency,
        });
        continue;
      }

      const numBedsDb = scraperDiffCell(match.numBeds);
      const numBathsDb = scraperDiffCell(match.numBaths);
      const priceDb = scraperDiffCell(match.price);

      // A field the scraper could not read ('') never counts as a change.
      const changed =
        (numBedsScraped !== '' && numBedsScraped !== numBedsDb) ||
        (numBathsScraped !== '' && numBathsScraped !== numBathsDb) ||
        (priceScraped !== '' && priceScraped !== priceDb);

      if (changed) {
        changedCount += 1;
      } else {
        unchangedCount += 1;
      }

      diffRows.push({
        status: changed ? 'CHANGED' : 'UNCHANGED',
        firestoreId: match.id,
        dbName: match.name ?? '',
        scrapedAddress,
        numBedsScraped,
        numBedsDb,
        numBathsScraped,
        numBathsDb,
        priceScraped,
        priceDb,
        sourceUrl: prop.sourceUrl,
        agency: prop.agency,
      });
    }

    // Write diff CSV: header line followed by one line per diff row.
    const csvLines = [
      SCRAPER_DIFF_COLUMNS.join(','),
      ...diffRows.map((row) =>
        SCRAPER_DIFF_COLUMNS.map((column) => escapeCSVField(row[column])).join(',')
      ),
    ];

    const csvPath = path.join(__dirname, '../scripts/scraper_diff.csv');
    // Non-blocking write; create the target directory in case it does not
    // exist next to the running (possibly compiled) file.
    await fs.promises.mkdir(path.dirname(csvPath), { recursive: true });
    await fs.promises.writeFile(csvPath, csvLines.join('\n'), 'utf8');
    console.log(`[run-scraper] Diff CSV written to ${csvPath}`);

    res.status(200).json({
      total: scraped.length,
      newCount,
      changedCount,
      unchangedCount,
      scraperErrors,
      csvReady: true,
    });
  } catch (err) {
    console.error('[run-scraper] Error:', err);
    res.status(500).send(`Scraper error: ${err instanceof Error ? err.message : 'Unknown error'}`);
  }
});
2103+
/**
 * Download Scraper Diff CSV - Returns the last scraper_diff.csv written by
 * POST /api/admin/run-scraper.
 *
 * The file is streamed to the client. A missing file is reported as 404 and
 * any other read failure as 500, instead of surfacing as an unhandled stream
 * error.
 *
 * @route GET /api/admin/scraper-results.csv
 *
 * @status
 * - 200: CSV file download
 * - 401: Authentication failed
 * - 403: Unauthorized - Admin access required
 * - 404: No scraper results found — run the scraper first
 * - 500: Server error
 */
app.get('/api/admin/scraper-results.csv', authenticate, async (req, res) => {
  // Respond explicitly: throwing here would reject the handler's promise
  // outside of any try/catch and never produce the documented 401.
  if (!req.user) {
    res.status(401).send('Not authenticated');
    return;
  }

  const { email } = req.user;
  if (!email || !admins.includes(email)) {
    res.status(403).send('Unauthorized: Admin access required');
    return;
  }

  const csvPath = path.join(__dirname, '../scripts/scraper_diff.csv');
  const csvStream = fs.createReadStream(csvPath);

  // Only commit to a CSV response once the file has actually been opened, so
  // a missing file can still be answered with a plain 404.
  csvStream.once('open', () => {
    res.setHeader('Content-Type', 'text/csv');
    res.setHeader('Content-Disposition', 'attachment; filename="scraper_diff.csv"');
    csvStream.pipe(res);
  });

  // Without this listener a read failure (including the file disappearing
  // between an existence check and the read) would be an unhandled 'error'
  // event on the stream.
  csvStream.once('error', (err: NodeJS.ErrnoException) => {
    if (res.headersSent) {
      // Part of the file is already on the wire; all we can do is abort.
      res.destroy(err);
      return;
    }
    res.removeHeader('Content-Type');
    res.removeHeader('Content-Disposition');
    if (err.code === 'ENOENT') {
      res.status(404).send('No scraper results found. Run POST /api/admin/run-scraper first.');
      return;
    }
    console.error('[scraper-results] Error:', err);
    res.status(500).send('Error reading scraper results');
  });

  // pipe() does not close the source when the client disconnects early.
  res.once('close', () => {
    csvStream.destroy();
  });
});
2137+
18912138/**
18922139 * Update Pending Building Status - Updates the status of a pending building report.
18932140 *
0 commit comments