Skip to content

Commit a848501

Browse files
authored
chore: Generate candidate feed list to use with the validator acceptance tests (#1583)
1 parent 5eb1a47 commit a848501

3 files changed

Lines changed: 129 additions & 0 deletions

File tree

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Creating the list of feeds used for validator acceptance tests
2+
3+
- Initially the gtfs-validator acceptance tests (See [acceptance_test.yml](https://github.com/MobilityData/gtfs-validator/blob/master/.github/workflows/acceptance_test.yml)) would use all the feeds from the [mobility-database-catalog](https://github.com/MobilityData/mobility-database-catalogs). Over time this proved to be too time consuming so a curated list of feeds was created to speed up the tests while still providing good coverage.
4+
5+
- The curated list of feeds is obtained using a query on the [Mobility Database](https://mobilitydatabase.org/).
6+
7+
- The query is in create_list.sql.
8+
9+
- You can use the create_acceptance_list.py script to generate the list. From the mobility-feed-api base directory, you can:
10+
- `cd scripts/validator-acceptance-feed-list`
11+
- `cp ../../config/.env.local .env`
12+
- Edit .env to point to the DB you want to query (usually the prod DB). You will probably have to tunnel to the DB (See [tunnel-create.sh](https://github.com/MobilityData/mobility-feed-api/blob/main/scripts/tunnel-create.sh)).
13+
- Make sure Python is available (the suggestion is to use the same virtual environment as the API)
14+
- Execute the script: `python create_acceptance_list.py --env-file .env`
15+
- The list will be created in `acceptance_test_feed_list.csv`
16+
17+
If you have access to the [MobilityData Metabase site](https://metabase.mobilitydatabase.org/), a simpler way to get the list is to create a question using the query.
18+
19+
Once the list is created, it should be committed to the gtfs-validator repository, in [acceptance_test_feed_list.csv](https://github.com/MobilityData/gtfs-validator/blob/master/scripts/mobility-database-harvester/acceptance_test_feed_list.csv) where it is used by the gtfs-validator acceptance tests.
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# This script connects to a PostgreSQL database using credentials from a .env file,
2+
# executes a SQL query from create_list.sql, and writes the results to a CSV file.
3+
# It is intended to generate a validator acceptance list for GTFS feeds.
4+
5+
import os
6+
import csv
7+
import psycopg2
8+
import argparse
9+
from dotenv import load_dotenv
10+
11+
# Command-line handling: the caller points the script at the .env file
# holding the database credentials via --env-file.
def parse_args():
    """Return the parsed command-line arguments (currently just --env-file)."""
    arg_parser = argparse.ArgumentParser(
        description="Create validator acceptance list CSV from DB query."
    )
    arg_parser.add_argument(
        '--env-file',
        default='config/.env.local',
        help='Path to .env file (default: config/.env.local)',
    )
    return arg_parser.parse_args()
17+
18+
# Locations of the input SQL query and the generated output CSV, both
# resolved relative to this script's own directory.
_SCRIPT_DIR = os.path.dirname(__file__)
SQL_FILE = os.path.join(_SCRIPT_DIR, 'create_list.sql')
CSV_FILE = os.path.join(_SCRIPT_DIR, 'acceptance_test_feed_list.csv')
21+
22+
def main():
    """Generate the validator acceptance feed list CSV.

    Loads PostgreSQL credentials from the .env file named by --env-file,
    executes the read-only query stored in SQL_FILE against that database,
    and writes the result set (header row included) to CSV_FILE.
    """
    args = parse_args()
    # Load environment variables from the specified env file.
    load_dotenv(args.env_file)

    # Read PostgreSQL connection parameters from environment variables.
    db_host = os.getenv('POSTGRES_HOST')
    db_port = os.getenv('POSTGRES_PORT')
    db_name = os.getenv('POSTGRES_DB')
    db_user = os.getenv('POSTGRES_USER')
    db_pass = os.getenv('POSTGRES_PASSWORD')

    # Echo the connection settings (never the password) for debugging.
    # Fixed: this first print was an f-string with no placeholders.
    print("Connecting to PostgreSQL with:")
    print(f" HOST: {db_host}")
    print(f" PORT: {db_port}")
    print(f" DB: {db_name}")
    print(f" USER: {db_user}")

    # Read the SQL query from file.
    with open(SQL_FILE, 'r') as f:
        query = f.read()

    # Connect to PostgreSQL and execute the query.  try/finally guarantees
    # the cursor and connection are released even when the query raises
    # (the original code leaked both on any exception).
    conn = psycopg2.connect(
        host=db_host,
        port=db_port,
        dbname=db_name,
        user=db_user,
        password=db_pass,
    )
    try:
        cur = conn.cursor()
        try:
            # Enforce a read-only session for extra safety: this script
            # must never modify the database it queries.
            cur.execute("SET SESSION CHARACTERISTICS AS TRANSACTION READ ONLY;")
            cur.execute(query)
            rows = cur.fetchall()
            headers = [desc[0] for desc in cur.description]
        finally:
            cur.close()
    finally:
        conn.close()

    # Write results to the output CSV file, header row first.
    with open(CSV_FILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        writer.writerows(rows)


if __name__ == '__main__':
    main()
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
-- Build the curated list of GTFS feeds used by the gtfs-validator
-- acceptance tests.  Only stable_id and "urls.latest" are consumed by the
-- tests; the other columns are informational.
SELECT DISTINCT ON (fd.stable_id)
    fd.stable_id,
    fd.provider,
    loc.country_code,
    loc.subdivision_name,
    loc.municipality,
    CONCAT('https://files.mobilitydatabase.org/', fd.stable_id, '/latest.zip') AS "urls.latest"
FROM feed AS fd
JOIN gtfsfeed AS gtfs ON gtfs.id = fd.id
JOIN locationfeed AS loc_link ON loc_link.feed_id = fd.id
JOIN location AS loc ON loc.id = loc_link.location_id
WHERE fd.data_type = 'gtfs'
  AND fd.stable_id LIKE 'mdb-%'
  AND fd.status <> 'deprecated'
  AND fd.operational_status = 'published'
  -- Exclude specific feeds because they take too long.
  AND fd.stable_id NOT IN ('mdb-784', 'mdb-1081', 'mdb-1078')
  AND gtfs.latest_dataset_id IS NOT NULL
  AND (
    -- Keep every non-US (or unknown-country) feed ...
    loc.country_code <> 'US'
    OR loc.country_code IS NULL
    -- ... plus a representative sample of US feeds ...
    OR fd.provider ILIKE 'Chicago Transit Authority%'
    OR loc.subdivision_name ILIKE 'California%'
    OR loc.subdivision_name ILIKE 'New York%'
    OR fd.provider ILIKE 'Miami-Dade Transit%'
    -- ... plus specific feeds that cover particular validator notices.
    OR fd.stable_id IN (
      'mdb-2164',  -- covers invalid_geometry
      'mdb-2447',  -- covers invalid_pickup_drop_off_window and missing_pickup_drop_off_booking_rule_id
      'mdb-2446',  -- covers missing_pickup_or_drop_off_window
      'mdb-2165',  -- covers missing_prior_day_booking_field_value (validator 6 only) and missing_prior_notice_last_time
      'mdb-2831',  -- covers overlapping_zone_and_pickup_drop_off_window
      'mdb-2882'   -- covers forbidden_shape_dist_traveled
    )
  )
ORDER BY fd.stable_id;

0 commit comments

Comments
 (0)