Skip to content

Commit 5713053

Browse files
authored
Merge pull request #623 from brownplt/shareurl-proxy
Add /load-shareurl proxy for hosts blocked on some school networks
2 parents 9020176 + a2ca9d8 commit 5713053

4 files changed

Lines changed: 246 additions & 16 deletions

File tree

package-lock.json

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
"react-dom": "^15.7.0",
5656
"redis": "^0.10.3",
5757
"request": "^2.88.2",
58+
"request-filtering-agent": "^3.2.0",
5859
"requirejs": "2.1.14",
5960
"s-expression": "~2.2.0",
6061
"script-loader": "^0.7.2",

src/server.js

Lines changed: 99 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,18 @@ const { drive } = require("googleapis/build/src/apis/drive/index.js");
77

88
var BACKREF_KEY = "originalProgram";
99

10+
// Limits for the streaming proxy. /downloadImg gets larger/looser caps because
11+
// images can legitimately be tens of MB; also we've seen e.g. Drive ?export=
12+
// take a while to get going. SHAREURL is intended to always be program
13+
// plaintext.
14+
// NOTE(joe + claude): really the timeout maybe should be on idleness at
15+
// startup/between bytes, not overall per completed request, but that's work to
16+
// plumb into `request`
17+
var IMAGE_PROXY_MAX_BYTES = 20 * 1024 * 1024; // 20 MB
18+
var IMAGE_PROXY_TIMEOUT_MS = 30 * 1000; // 30 s
19+
var SHAREURL_PROXY_MAX_BYTES = 1 * 1024 * 1024; // 1 MB
20+
var SHAREURL_PROXY_TIMEOUT_MS = 10 * 1000; // 10 s
21+
1022
function start(config, onServerReady) {
1123
var defaultOpts = {
1224
PYRET: process.env.PYRET,
@@ -27,6 +39,7 @@ function start(config, onServerReady) {
2739
var csrf = require('csurf');
2840
var googleAuth = require('./google-auth.js');
2941
var request = require('request');
42+
var requestFilteringAgent = require('request-filtering-agent');
3043
var mustache = require('mustache-express');
3144
var url = require('url');
3245
var fs = require('fs');
@@ -186,24 +199,74 @@ function start(config, onServerReady) {
186199
});
187200
}
188201

189-
app.get("/downloadImg", function(req, response) {
190-
var parsed = url.parse(req.url);
191-
var googleLink = decodeURIComponent(parsed.query.slice(0));
192-
var googleParsed = url.parse(googleLink);
193-
var gReq = request({url: googleLink, encoding: 'binary'}, function(error, imgResponse, body) {
194-
if(error) {
195-
response.status(400).send({type: "image-load-failure", error: "Unable to load image " + String(error)});
202+
/**
 * Fetch `opts.url` and relay the upstream body on `opts.res`, enforcing a
 * host allowlist, a content-type check, a timeout and a byte cap.
 *
 * Fields read from `opts`:
 *   res           - response object to answer on.
 *   url           - absolute URL string to fetch; anything else is answered
 *                   with 400 `invalid-url`.
 *   allowedHosts  - optional predicate on a hostname; when present it is
 *                   applied to the initial URL and to every redirect target.
 *   maxBytes      - largest upstream body that will be relayed.
 *   timeoutMs     - passed to `request` as its `timeout` option.
 *   contentTypeOk - optional predicate on the upstream Content-Type header.
 *   onError       - function(res, err) invoked when the upstream request
 *                   errors before any response headers were sent.
 */
function proxyStreamFetch(opts) {
  var res = opts.res;
  res.set('X-Content-Type-Options', 'nosniff');
  res.set('Content-Security-Policy', 'sandbox');

  // Callers hand us request-derived values, which are not guaranteed to be
  // strings. Reject anything else up front instead of letting it reach
  // `request` / the filtering agent.
  if (typeof opts.url !== 'string') {
    return res.status(400).send({ error: 'invalid-url' });
  }
  var parsed;
  try { parsed = new URL(opts.url); }
  catch (e) { return res.status(400).send({ error: 'invalid-url' }); }
  if (opts.allowedHosts && !opts.allowedHosts(parsed.hostname)) {
    return res.status(400).send({ error: 'host-not-allowed' });
  }

  var bytes = 0;
  var overLimit = false; // set once the byte cap has been enforced
  var upstream = request({
    url: opts.url,
    timeout: opts.timeoutMs,
    agent: requestFilteringAgent.useAgent(opts.url),
    followRedirect: function(resp) {
      if (!opts.allowedHosts) return true;
      try {
        var next = new URL(resp.headers.location, opts.url);
        return opts.allowedHosts(next.hostname);
      } catch (_) { return false; }
    },
  });
  // If the client disconnects (e.g. the browser aborts /load-shareurl after
  // direct succeeded), tear down the upstream connection too, otherwise
  // we'd keep streaming bytes from upstream to nowhere.
  res.on('close', function() { upstream.destroy(); });
  upstream.on('error', function(err) {
    if (!res.headersSent) opts.onError(res, err);
    // Headers already went out, so no error status can be reported any more.
    // Tear the response down so the client sees a failed transfer instead of
    // waiting on a body that will never complete.
    else res.destroy();
  });
  upstream.on('response', function(upRes) {
    var contentType = upRes.headers['content-type'];
    if (opts.contentTypeOk && !opts.contentTypeOk(contentType)) {
      upstream.destroy();
      return res.status(400).send({ error: 'content-type-not-allowed', detail: contentType });
    }
    // When upstream declares its size, refuse oversized bodies before relaying
    // anything so the client gets a clean error rather than a cut-off stream.
    // A missing/unparseable header yields NaN, which never compares greater.
    var declaredLength = Number(upRes.headers['content-length']);
    if (declaredLength > opts.maxBytes) {
      upstream.destroy();
      return res.status(502).send({ error: 'too-large' });
    }
    res.status(upRes.statusCode);
    if (contentType) {
      res.set('content-type', contentType);
    }
    // Relay from upRes (the IncomingMessage), never by piping `upstream`
    // itself: per the original note here, the request library's .pipe copies
    // upstream headers verbatim and would overwrite the security headers set
    // above. Chunks are written by hand (instead of upRes.pipe) so that the
    // size check runs BEFORE a chunk is forwarded; a chunk that crosses the
    // cap is never written to a response we are about to end or destroy.
    upRes.on('data', function(chunk) {
      if (overLimit) return;
      bytes += chunk.length;
      if (bytes > opts.maxBytes) {
        overLimit = true;
        upstream.destroy();
        if (!res.headersSent) {
          // Drop the upstream content-type so the error body is not labelled
          // with it.
          res.removeHeader('content-type');
          res.status(502).send({ error: 'too-large' });
        } else {
          res.destroy();
        }
        return;
      }
      // Honor backpressure the way pipe would: stop reading until the client
      // side has drained.
      if (!res.write(chunk)) {
        upRes.pause();
        res.once('drain', function() { upRes.resume(); });
      }
    });
    upRes.on('end', function() {
      if (!overLimit) res.end();
    });
  });
}
257+
258+
// Image proxy. The ENTIRE raw query string (not a named parameter) is the
// percent-encoded URL of the image to fetch. Any host is permitted
// (allowedHosts: null), but the upstream response must declare an image/*
// content type.
app.get("/downloadImg", function(req, response) {
  // `query` is null when the request has no "?..." part; treat that as an
  // empty link, which proxyStreamFetch answers with 400 invalid-url.
  var rawQuery = url.parse(req.url).query;
  var googleLink;
  try {
    googleLink = decodeURIComponent(rawQuery || '');
  } catch (e) {
    // Malformed percent-encoding is a client error, not a server crash.
    return response.status(400).send({ type: 'image-load-failure', error: 'Unable to load image ' + String(e) });
  }
  proxyStreamFetch({
    res: response,
    url: googleLink,
    allowedHosts: null,
    maxBytes: IMAGE_PROXY_MAX_BYTES,
    timeoutMs: IMAGE_PROXY_TIMEOUT_MS,
    contentTypeOk: function(ct) { return ct && ct.indexOf('image/') === 0; },
    onError: function(res, err) {
      res.status(400).send({ type: 'image-load-failure', error: 'Unable to load image ' + String(err) });
    },
  });
});
209272

@@ -565,6 +628,26 @@ function start(config, onServerReady) {
565628

566629
});
567630

631+
// Server-side proxy for #shareurl loads from hosts that some school networks
632+
// block or will likely block (notably raw.githubusercontent.com).
633+
// Eager-proxied client-side for any URL whose host is in
634+
// SHAREURL_ALLOWED_HOSTS. We can expand this list as needed.
635+
var SHAREURL_ALLOWED_HOSTS = new Set(['raw.githubusercontent.com']);
636+
637+
// GET /load-shareurl?url=<absolute URL>
// Relays the target through proxyStreamFetch using the SHAREURL_* limits.
// Only hosts present in SHAREURL_ALLOWED_HOSTS are accepted, no content-type
// filter is applied, and every upstream failure is reported to the client as
// a generic 502.
app.get("/load-shareurl", function(req, res) {
  function hostIsAllowed(hostname) {
    return SHAREURL_ALLOWED_HOSTS.has(hostname);
  }

  // The upstream error detail is intentionally not forwarded to the client.
  function reportUpstreamFailure(clientRes, err) {
    clientRes.status(502).send({ error: 'upstream-error' });
  }

  var proxyOptions = {
    res: res,
    url: req.query.url,
    allowedHosts: hostIsAllowed,
    maxBytes: SHAREURL_PROXY_MAX_BYTES,
    timeoutMs: SHAREURL_PROXY_TIMEOUT_MS,
    contentTypeOk: null,
    onError: reportUpstreamFailure,
  };
  proxyStreamFetch(proxyOptions);
});
650+
568651

569652
app.post("/share-image", function(req, res) {
570653
var driveFileId = req.body.fileId;

src/web/js/beforePyret.js

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,130 @@
33
var originalPageLoad = Date.now();
44
console.log("originalPageLoad: ", originalPageLoad);
55

6+
// Transparently route browser fetches to allowlisted hosts through the
7+
// server-side proxy at /load-shareurl, but only when the direct path doesn't
8+
// work.
9+
//
10+
// Strategy: the FIRST fetch to an allowlisted host fires direct + proxied in
11+
// parallel. We decide shouldProxy for the rest of the page-load from direct's
12+
// response *headers*:
13+
// - direct returned 2xx with content-type text/plain -> shouldProxy=false:
14+
// serve direct's response, abort the in-flight proxy fetch.
15+
// - direct failed, hung past timeout, or returned anything else
16+
// -> shouldProxy=true:
17+
// serve proxy's response.
18+
// A key idea is that network-blocky things sometimes return 200 with a
19+
// message page about blocking (or an error, but that counts as a fail). We
20+
// don't want to accidentally think that's a success.
21+
// shouldProxy state is in-memory and per-host — never persisted, since
22+
// reachability changes between networks and a stale value would silently
23+
// break loads.
24+
//
25+
// Installed on the global fetch as early as possible so it catches every fetch
26+
// caller; some of them are in the pyret-lang runtime and would be otherwise
27+
// difficult to configure.
28+
const SHAREURL_PROXY_HOSTS = new Set(['raw.githubusercontent.com']);
29+
const SHAREURL_DIRECT_TIMEOUT_MS = 5000;
30+
const _origFetch = window.fetch.bind(window);
31+
32+
const _shareurlShouldProxy = new Map(); // host -> boolean
33+
const _shareurlShouldProxyInflight = new Map(); // host -> Promise<boolean>
34+
35+
// Build the same-origin /load-shareurl address that asks the server to fetch
// `fetchInput` (any value accepted by _shareurlInputToUrl) on our behalf.
function _shareurlProxyUrl(fetchInput) {
  const targetUrl = _shareurlInputToUrl(fetchInput);
  const encodedTarget = encodeURIComponent(targetUrl);
  return '/load-shareurl?url=' + encodedTarget;
}
38+
39+
// Reduce a fetch() first argument to a URL string: strings pass through
// unchanged, Request objects yield their .url, and anything else is
// stringified with String().
function _shareurlInputToUrl(fetchInput) {
  if (typeof fetchInput === 'string') {
    return fetchInput;
  }
  // Request may not exist as a global in every environment, so probe for it
  // before using instanceof.
  const isRequestObject = typeof Request !== 'undefined' && fetchInput instanceof Request;
  if (isRequestObject) {
    return fetchInput.url;
  }
  return String(fetchInput);
}
44+
45+
// Decide whether a direct response `r` can be trusted as the real upstream
// answer: it must be OK (r.ok) AND carry a content-type beginning with
// text/plain (compared case-insensitively). Per the design notes in this
// file, source files from raw.githubusercontent.com are served as text/plain,
// while block pages and captive portals typically are not, so anything else
// is treated as a failure.
function _shareurlVerifyDirect(r) {
  if (r.ok) {
    const rawContentType = r.headers.get('content-type') || '';
    return rawContentType.toLowerCase().startsWith('text/plain');
  }
  return false;
}
54+
55+
// Issue one fetch through the original (unpatched) fetch: to the
// /load-shareurl proxy when `shouldProxy` is truthy, otherwise straight to
// `fetchInput`. `fetchInit` is forwarded untouched either way.
function _shareurlFetch(shouldProxy, fetchInput, fetchInit) {
  if (shouldProxy) {
    return _origFetch(_shareurlProxyUrl(fetchInput), fetchInit);
  }
  return _origFetch(fetchInput, fetchInit);
}
59+
60+
// Run the first fetch to an allowlisted host both ways at once.
//
// Returns { responsePromise, shouldProxyPromise }:
//   responsePromise    - what the caller of fetch() receives: whichever of
//                        "verified direct response" or "proxy response"
//                        fulfills first; if both reject, the proxy's error
//                        (falling back to direct's).
//   shouldProxyPromise - resolves to false only if the direct response
//                        verified within SHAREURL_DIRECT_TIMEOUT_MS; resolves
//                        to true on direct failure or timeout. Never rejects.
function _shareurlRace(fetchInput, fetchInit) {
  const proxyAbort = new AbortController();
  // NOTE: the caller's own `signal` (if any) is replaced for the proxy leg
  // only, so a caller-side abort does not cancel the proxy fetch. This
  // trade-off was accepted by the original author to avoid forwarding abort
  // events between signals.
  const proxyInit = Object.assign({}, fetchInit, { signal: proxyAbort.signal });
  const viaProxy = _origFetch(_shareurlProxyUrl(fetchInput), proxyInit);

  // The direct leg only counts as fulfilled when its headers verify; an
  // unverified response is turned into a rejection.
  const viaDirect = _origFetch(fetchInput, fetchInit).then(directResponse => {
    if (_shareurlVerifyDirect(directResponse)) {
      return directResponse;
    }
    throw new Error('direct request failed');
  });

  // Proxy-or-not is decided purely by the direct leg: verified in time means
  // false; failure or timeout means true.
  const directVerdict = viaDirect.then(() => false, () => true);
  const directDeadline = new Promise(resolve => {
    setTimeout(() => resolve(true), SHAREURL_DIRECT_TIMEOUT_MS);
  });
  const shouldProxyPromise = Promise.race([directVerdict, directDeadline]);

  // Abort the proxy leg only when direct verified BEFORE the proxy settled.
  // Once the proxy has returned, the caller may already be reading its body,
  // and aborting then would error that stream mid-read.
  const directWonRace = Promise.race([
    viaDirect.then(() => true, () => false),
    viaProxy.then(() => false, () => false),
  ]);
  directWonRace.then(directWon => {
    if (directWon) {
      proxyAbort.abort();
    }
  });

  // First leg to fulfill wins. When both reject, prefer the proxy's error:
  // direct's may be nothing more than the synthetic 'direct request failed'.
  const responsePromise = Promise.any([viaDirect, viaProxy]).catch(aggregate => {
    const directError = aggregate.errors[0];
    const proxyError = aggregate.errors[1];
    throw proxyError || directError;
  });

  return { responsePromise: responsePromise, shouldProxyPromise: shouldProxyPromise };
}
103+
104+
// Replacement for the global fetch. Requests to hosts outside
// SHAREURL_PROXY_HOSTS (or with an unparseable URL) go straight to the
// original fetch. For allowlisted hosts:
//   - decision already cached  -> fetch direct or via proxy accordingly;
//   - decision still in flight -> wait for it, then issue one fresh fetch;
//   - otherwise                -> run the direct-vs-proxy race and record the
//                                 decision for later fetches to that host.
window.fetch = function(fetchInput, fetchInit) {
  let host;
  try {
    const resolved = new URL(_shareurlInputToUrl(fetchInput), window.location.href);
    host = resolved.hostname;
  } catch (_) {
    return _origFetch(fetchInput, fetchInit);
  }
  if (!SHAREURL_PROXY_HOSTS.has(host)) {
    return _origFetch(fetchInput, fetchInit);
  }

  const decided = _shareurlShouldProxy.get(host);
  if (decided !== undefined) {
    return _shareurlFetch(decided, fetchInput, fetchInit);
  }

  const pendingDecision = _shareurlShouldProxyInflight.get(host);
  if (pendingDecision) {
    // A race for this host is already running: queue behind its decision and
    // issue a single fresh request once it is known.
    return pendingDecision.then(useProxy => _shareurlFetch(useProxy, fetchInput, fetchInit));
  }

  // First fetch to this host during this page load: run the race.
  const race = _shareurlRace(fetchInput, fetchInit);
  _shareurlShouldProxyInflight.set(host, race.shouldProxyPromise);
  race.shouldProxyPromise.then(useProxy => {
    _shareurlShouldProxy.set(host, useProxy);
    _shareurlShouldProxyInflight.delete(host);
  });
  return race.responsePromise;
};
129+
6130
const isEmbedded = window.parent !== window;
7131

8132
var shareAPI = makeShareAPI(process.env.CURRENT_PYRET_RELEASE);

0 commit comments

Comments
 (0)