Skip to content

Commit 3275d3d

Browse files
committed
Rewrite HttpClient handling
1 parent 0fd6b33 commit 3275d3d

3 files changed

Lines changed: 99 additions & 67 deletions

File tree

FrmMainApp.cs

Lines changed: 90 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
using System.Configuration;
22
using System.Diagnostics;
33
using System.Globalization;
4-
using System.Net;
54
using CsvHelper;
65
using HLWebScraper.Net.Helpers;
76
using HLWebScraper.Net.Model;
@@ -36,12 +35,22 @@ public partial class FrmMainApp : Form
3635
private KeyValueConfigurationCollection _section = new();
3736
private CancellationTokenSource cancellationTokenSource;
3837

38+
private static HttpClient _httpClient = new();
39+
3940
public FrmMainApp()
4041
{
4142
cancellationTokenSource = new CancellationTokenSource();
4243

4344
InitializeComponent();
4445
GetETFTypesFromCSV();
46+
47+
SocketsHttpHandler socketsHandler = new()
48+
{
49+
PooledConnectionLifetime = TimeSpan.FromMinutes(value: 2),
50+
MaxConnectionsPerServer = 100
51+
};
52+
53+
_httpClient = new HttpClient(handler: socketsHandler);
4554
}
4655

4756
private void FrmMainApp_Load(object sender, EventArgs e)
@@ -109,7 +118,6 @@ private void FillCbx_Securities(string filter = "")
109118
private static async Task<bool> ReadJsonFXFromWebAsync(FrmMainApp formInstance,
110119
CancellationToken cancellationToken)
111120
{
112-
WebClient wc = new();
113121
try
114122
{
115123
// Check if cancellation has been requested
@@ -123,7 +131,13 @@ private static async Task<bool> ReadJsonFXFromWebAsync(FrmMainApp formInstance,
123131
appendText: "Contacting FX Data website (http://www.floatrates.com/currency/gbp/)",
124132
logMessageType: LogMessageTypes.Info);
125133

126-
string jsonString = await wc.DownloadStringTaskAsync(address: FxUrl);
134+
135+
HttpResponseMessage response =
136+
await _httpClient.GetAsync(requestUri: FxUrl, cancellationToken: cancellationToken);
137+
response.EnsureSuccessStatusCode();
138+
139+
string jsonString = await response.Content.ReadAsStringAsync(cancellationToken: cancellationToken);
140+
127141
Dictionary<string, FXCurrency>? currencies =
128142
JsonConvert.DeserializeObject<Dictionary<string, FXCurrency>>(value: jsonString);
129143

@@ -188,9 +202,12 @@ private static async Task CollateHLStocksByLetterAsync(FrmMainApp formInstance,
188202
// Check if cancellation has been requested
189203
cancellationToken.ThrowIfCancellationRequested();
190204

191-
WebClient client = new();
192205
string url = "https://www.hl.co.uk/shares/shares-search-results/" + alphabetChar;
193-
respString = await client.DownloadStringTaskAsync(address: url);
206+
HttpResponseMessage response =
207+
await _httpClient.GetAsync(requestUri: url, cancellationToken: cancellationToken);
208+
response.EnsureSuccessStatusCode();
209+
210+
respString = await response.Content.ReadAsStringAsync(cancellationToken: cancellationToken);
194211
timeOutBool = true;
195212
}
196213
catch (Exception ex)
@@ -254,54 +271,31 @@ private static async Task FireAndForgetAsync(HashSet<string> urls, FrmMainApp fo
254271
AppendLogWindowText(tbx: formInstance.tbx_Log, appendText: "Scraping all items.",
255272
logMessageType: LogMessageTypes.Start);
256273

257-
// Chunk size for processing tasks
258-
const int chunkSize = 50;
259-
260-
// Create a list to hold the tasks
261274
List<Task> tasks = new();
262275

263-
// Iterate over the URLs in chunks
264-
for (int i = 0; i < urls.Count; i += chunkSize)
276+
// Loop through each URL and start scraping
277+
foreach (string url in urls)
265278
{
266-
// Get the chunk of URLs
267-
IEnumerable<string> chunk = urls.Skip(count: i).Take(count: chunkSize);
268-
269-
// Create a list to hold the tasks for this chunk
270-
List<Task> chunkTasks = new();
271-
272-
// Create a new HttpClient for this chunk
273-
using HttpClient httpClient = new();
274-
275-
// Loop through each URL in the chunk and start scraping
276-
foreach (string url in chunk)
277-
{
278-
// Check if cancellation has been requested
279-
cancellationToken.ThrowIfCancellationRequested();
280-
281-
// Construct the URL with "/company-information" appended
282-
string companyInfoUrl = url + "/company-information";
283-
284-
// Add tasks to scrape the main URL and company info URL
285-
chunkTasks.Add(item: GetHtmlAsyncWithClient(httpClient: httpClient, url: url,
286-
formInstance: formInstance, cancellationToken: cancellationToken));
287-
chunkTasks.Add(item: GetHtmlAsyncWithClient(httpClient: httpClient, url: companyInfoUrl,
288-
formInstance: formInstance, cancellationToken: cancellationToken));
289-
}
290-
291-
// Add chunk tasks to the main tasks list
292-
tasks.AddRange(collection: chunkTasks);
279+
// Check if cancellation has been requested
280+
cancellationToken.ThrowIfCancellationRequested();
293281

294-
// Wait for the chunk tasks to complete or cancellation requested
295-
await Task.WhenAll(tasks: chunkTasks);
296-
297-
// Dispose of the HttpClient to close connections and release resources
298-
httpClient.Dispose();
282+
// Construct the URL with "/company-information" appended
283+
string companyInfoUrl = url + "/company-information";
299284

285+
// Add tasks to scrape the main URL and company info URL
286+
tasks.Add(item: GetHtmlAsync(url: url,
287+
formInstance: formInstance, cancellationToken: cancellationToken));
288+
tasks.Add(item: GetHtmlAsync(url: companyInfoUrl,
289+
formInstance: formInstance, cancellationToken: cancellationToken));
300290
// Break loop if cancellation requested
301291
if (cancellationToken.IsCancellationRequested)
302292
break;
303293
}
304294

295+
// Wait for all tasks to complete or for cancellation to be requested
296+
await Task.WhenAll(tasks: tasks);
297+
298+
305299
// Scraping completed
306300
AppendLogWindowText(tbx: formInstance.tbx_Log, appendText: "Scraping all items.",
307301
logMessageType: LogMessageTypes.Done);
@@ -315,31 +309,61 @@ private static async Task FireAndForgetAsync(HashSet<string> urls, FrmMainApp fo
315309
}
316310
}
317311

318-
private static async Task GetHtmlAsyncWithClient(HttpClient httpClient, string url, FrmMainApp formInstance,
319-
CancellationToken cancellationToken)
312+
private static async Task GetHtmlAsync(string url, FrmMainApp formInstance, CancellationToken cancellationToken)
320313
{
321-
// Check if cancellation has been requested
314+
// Check if cancellation has been requested before making the request
322315
cancellationToken.ThrowIfCancellationRequested();
323316

324-
// Make the HTTP request
325-
HttpResponseMessage response = await httpClient.GetAsync(requestUri: url, cancellationToken: cancellationToken);
326-
string htmlContent = await response.Content.ReadAsStringAsync(cancellationToken: cancellationToken);
327-
328-
// Process the HTML content
329-
// Page
330-
if (!url.Contains(value: "company-info"))
331-
urlAndHtmlContentHashtable.AddOrUpdate(key: url,
332-
value: HelperStringUtils.TrimAndReplaceNewLinesAndTabs(
333-
text: ReturnPageText(
334-
HTMLTextInHtmlContentHashtable: HelperStringUtils.TrimInternalSpaces(s: htmlContent))));
335-
// Company
336-
else
337-
urlAndCompanyInfoHashtable.AddOrUpdate(key: url,
338-
value: HelperStringUtils.TrimAndReplaceNewLinesAndTabs(
339-
text: ReturnCompanyPageText(
340-
HTMLTextInCompanyInfoHashtable: HelperStringUtils.TrimInternalSpaces(s: htmlContent))));
317+
int maxRetries = 5;
318+
int retryCount = 0;
319+
320+
while (retryCount < maxRetries)
321+
try
322+
{
323+
Application.DoEvents();
324+
HttpResponseMessage response =
325+
await _httpClient.GetAsync(requestUri: url, cancellationToken: cancellationToken);
326+
response.EnsureSuccessStatusCode();
327+
string htmlContent = await response.Content.ReadAsStringAsync(cancellationToken: cancellationToken);
328+
329+
// Process the HTML content: URLs containing "company-info" go to the company-info table, all others to the page table
330+
if (!url.Contains(value: "company-info"))
331+
urlAndHtmlContentHashtable.AddOrUpdate(key: url,
332+
value: HelperStringUtils.TrimAndReplaceNewLinesAndTabs(
333+
text: ReturnPageText(
334+
HTMLTextInHtmlContentHashtable: HelperStringUtils.TrimInternalSpaces(s: htmlContent))));
335+
else
336+
urlAndCompanyInfoHashtable.AddOrUpdate(key: url,
337+
value: HelperStringUtils.TrimAndReplaceNewLinesAndTabs(
338+
text: ReturnCompanyPageText(
339+
HTMLTextInCompanyInfoHashtable: HelperStringUtils.TrimInternalSpaces(s: htmlContent))));
340+
341+
IncrementCounterAndLogProgress(url: url, formInstance: formInstance, isSuccess: true);
341342

342-
IncrementCounterAndLogProgress(url: url, formInstance: formInstance, isSuccess: true);
343+
// If the request succeeds, exit the retry loop
344+
return;
345+
}
346+
catch (OperationCanceledException ex) when (ex.InnerException is TimeoutException)
347+
{
348+
Application.DoEvents();
349+
// Timeout occurred, increment retry count
350+
retryCount++;
351+
// Log the timeout error
352+
AppendLogWindowText(tbx: formInstance.tbx_Log,
353+
appendText:
354+
$"Timeout occurred while fetching URL '{url}'. Retry attempt {retryCount} of {maxRetries}.");
355+
}
356+
catch (Exception ex)
357+
{
358+
// Log other errors but do not retry
359+
if (ex is HttpRequestException)
360+
IncrementCounterAndLogProgress(url: url, formInstance: formInstance, isSuccess: false,
361+
errorMsg: ex.Message);
362+
return;
363+
}
364+
365+
// Maximum retry attempts reached, log error and exit
366+
AppendLogWindowText(tbx: formInstance.tbx_Log, appendText: $"Maximum retry attempts reached for URL '{url}'.");
343367
}
344368

345369

@@ -363,8 +387,8 @@ private static async Task GetHtmlAsyncWithClient(HttpClient httpClient, string u
363387
{
364388
string logMessageVal = $"Parsing {url}";
365389
FrmMainApp frmMainAppInstance = (FrmMainApp)Application.OpenForms[name: "FrmMainApp"];
366-
AppendLogWindowText(tbx: frmMainAppInstance.tbx_Log, appendText: logMessageVal,
367-
logMessageType: LogMessageTypes.Start);
390+
//AppendLogWindowText(tbx: frmMainAppInstance.tbx_Log, appendText: logMessageVal,
391+
// logMessageType: LogMessageTypes.Start);
368392

369393
string pageText = HTMLTextInHtmlContentHashtable;
370394
string companyPageText = HTMLTextInCompanyInfoHashtable;

changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# HLWebscraper.Net Changelog
22

3+
**Build 88xx [202403xx]**
4+
- NEW & UPDATED:
5+
- I've rewritten the HttpClient handling. I should have read the documentation beforehand - a single shared client now handles all requests, so no more chunked loops!
6+
7+
- BUGS & FIXES:
8+
- N/A
9+
310
**Build 8840 [20240315]**
411
- NEW & UPDATED:
512
- Added logic to allow for limited selection of scraping targets

readme.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ On that note the app can only collect stuff that's either visibly available on t
4040

4141
### Performance
4242

43-
- The whole end-to-end process takes a fair bit of time so be patient. In my one test run the whole thing took in the vicinity of short of an hour.
43+
- The whole data pull is around 4.5GB but it appears to be capped on the server side somewhere around 15-20MB/sec (that's around 120-160 megabits/sec).
44+
- The whole end-to-end process (assuming favourable conditions) takes around 10-15 mins.
4445
- The app uses a library (`CompressedMemoryCache.cs`) built by Gustavo Augusto Hennig - its licence is contained in the file (it's Apache 2.0 btw)
4546
- It's necessary to use compression on the html pages because storing that many (read: tens of thousands) pages at 200-400kbytes each will eat up memory in no time. My initial tests of letters A-C made the app consume around 5GB RAM w/o compression and sub-1GB w/ compression.
4647
- I did some testing on what I deem is a 'normal' performance laptop, ie a Ryzen 7 PRO 5850U [8x 4370 MHz] w/ 32 GB RAM - the Release (non-Debug that is) version of the app peaked at 25% CPU and (again) around sub-1GB RAM.

0 commit comments

Comments
 (0)