From e5df149808d41ed54d53cc0a3263ab6d4866387e Mon Sep 17 00:00:00 2001
From: akinayturan
Date: Wed, 4 Mar 2026 22:54:25 +0300
Subject: [PATCH] Add repair mode for missing year fields in vehicle data;
 update README with usage instructions

Program.cs now has two entry paths, selected by the --repair-years flag:

* Repair mode (RunRepairYearsMode) reads an existing semicolon-delimited
  CSV (--input-csv, default hsntsn.csv) and writes a new one
  (--output-csv, default hsntsn.repaired.csv). It refuses to run when
  both options resolve to the same full path. For every record that lacks
  YearFrom or YearTo and has a non-blank SourceDetailUrl it fetches the
  detail page and fills in only the fields that are still null. MatchKey
  is recomputed for every record, and the writer is flushed and progress
  is logged every 250 records.
* Scrape mode (RunScrapeMode) is the previous top-level logic moved into
  a function; WriteVehicleIfNew becomes a local function inside it.

New helpers:

* GetOptionValue accepts both "--name value" and "--name=value".
* GetVehicleDetailWithRetry makes up to 7 attempts when the request
  fails with HTTP 429, waiting 2^attempt seconds (capped at 60) between
  attempts. Any other exception, or a 429 on the last attempt,
  propagates to the caller.
* IsTooManyRequests treats an HttpRequestException as a 429 when its
  StatusCode is TooManyRequests or its message contains "429".

.gitignore additionally ignores hsntsn.repaired.csv and scrape.log, and
the README documents the new command line.
---
Reviewer notes (this section is ignored by `git am`):

- NOTE(review): the generic type arguments in Program.cs are written as
  implied by usage in this diff: HashSet<string>,
  IReadOnlyList<HsnTsnVehicle>, IAsyncEnumerable<string>,
  WriteHeader<HsnTsnVehicle>() and GetRecordsAsync<HsnTsnVehicle>().
  IReadOnlyList<string> for brandUrls and the Task<HsnTsnVehicleDetail?>
  return type of GetVehicleDetailWithRetry are assumptions; confirm both
  against HsnTsnClient before applying.
- NOTE(review): indentation of context and removed lines assumes four
  spaces per level; use `git apply --ignore-whitespace` if the target
  file differs.
- NOTE(review): the From: header carries no e-mail address; add one if
  `git am` rejects the patch.
- The `return await` that follows the retry loop in
  GetVehicleDetailWithRetry is never reached at run time (the last
  attempt either returns or throws); it only satisfies the compiler.

 .gitignore                   |   4 +-
 README.md                    |   6 +
 src/HsnTsnScraper/Program.cs | 360 +++++++++++++++++++++++++----------
 3 files changed, 272 insertions(+), 98 deletions(-)

diff --git a/.gitignore b/.gitignore
index 384836e..6bb948c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 bin/
 obj/
 .idea/
-hsntsn.csv
\ No newline at end of file
+hsntsn.csv
+hsntsn.repaired.csv
+scrape.log
\ No newline at end of file
diff --git a/README.md b/README.md
index 31d79e5..2692393 100644
--- a/README.md
+++ b/README.md
@@ -32,3 +32,9 @@ Enable detail-page enrichment:
 ```bash
 printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
 ```
+
+Repair only missing year fields from an existing CSV:
+
+```bash
+dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv
+```
diff --git a/src/HsnTsnScraper/Program.cs b/src/HsnTsnScraper/Program.cs
index 588c495..f557b2c 100644
--- a/src/HsnTsnScraper/Program.cs
+++ b/src/HsnTsnScraper/Program.cs
@@ -1,136 +1,302 @@
 using System.Globalization;
+using System.Net;
+using System.Text.RegularExpressions;
 using CsvHelper;
 using CsvHelper.Configuration;
 using HsnTsnScraper;
 
+var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase);
 var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
+var inputCsv = GetOptionValue(args, "--input-csv");
+var outputCsv = GetOptionValue(args, "--output-csv");
 
 using var client = new HsnTsnClient();
-var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
-var processed = 0;
-var failed = 0;
 
-Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
-
-await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
+if (repairYears)
 {
-    Delimiter = ";"
-});
-
-csvWriter.WriteHeader<HsnTsnVehicle>();
-await csvWriter.NextRecordAsync();
-await csvWriter.FlushAsync();
-
-if (Console.IsInputRedirected)
-{
-    await foreach (var query in ReadInput())
-    {
-        IReadOnlyList<HsnTsnVehicle> vehicles;
-        try
-        {
-            vehicles = await client.GetVehiclesFromSearchAsync(query);
-        }
-        catch (Exception ex)
-        {
-            failed++;
-            Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
-            continue;
-        }
-
-        foreach (var vehicle in vehicles)
-        {
-            await WriteVehicleIfNew(vehicle);
-        }
-    }
-}
-else
-{
-    IReadOnlyList<string> brandUrls;
-    try
-    {
-        brandUrls = await client.GetBrandPageUrls();
-    }
-    catch (Exception ex)
-    {
-        Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
-        return;
-    }
-
-    foreach (var url in brandUrls)
-    {
-        Console.Error.WriteLine($"[info] Processing: {url}");
-        IReadOnlyList<HsnTsnVehicle> vehicles;
-        try
-        {
-            vehicles = await client.GetVehiclesFromBrandPageAsync(url);
-        }
-        catch (Exception ex)
-        {
-            failed++;
-            Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
-            continue;
-        }
-
-        foreach (var vehicle in vehicles)
-        {
-            await WriteVehicleIfNew(vehicle);
-        }
-    }
+    await RunRepairYearsMode(client, inputCsv, outputCsv);
+    return;
 }
 
-await csvWriter.FlushAsync();
-Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
+await RunScrapeMode(client, includeDetails);
 return;
 
-async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
+async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
 {
-    processed++;
+    var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
+    var outputCsvPath = string.IsNullOrWhiteSpace(outputPath) ? "hsntsn.repaired.csv" : outputPath;
 
-    if (!written.Add(vehicle.HsnTsn))
+    if (Path.GetFullPath(inputCsvPath).Equals(Path.GetFullPath(outputCsvPath), StringComparison.OrdinalIgnoreCase))
     {
+        Console.Error.WriteLine("[error] --input-csv and --output-csv cannot point to the same file.");
         return;
     }
 
-    if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
+    Console.Error.WriteLine($"[info] Repair mode started. input={inputCsvPath}, output={outputCsvPath}");
+
+    var processed = 0;
+    var failed = 0;
+    var updated = 0;
+
+    await using var inputStream = File.OpenRead(inputCsvPath);
+    using var inputReader = new StreamReader(inputStream);
+    using var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
     {
-        try
+        Delimiter = ";",
+        MissingFieldFound = null,
+        HeaderValidated = null
+    });
+
+    await using var outputStream = File.Create(outputCsvPath);
+    await using var outputWriter = new StreamWriter(outputStream);
+    await using var csvWriter = new CsvWriter(outputWriter, new CsvConfiguration(CultureInfo.InvariantCulture)
+    {
+        Delimiter = ";"
+    });
+
+    await csvReader.ReadAsync();
+    csvReader.ReadHeader();
+
+    csvWriter.WriteHeader<HsnTsnVehicle>();
+    await csvWriter.NextRecordAsync();
+
+    await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
+    {
+        processed++;
+        var changed = false;
+
+        if ((record.YearFrom is null || record.YearTo is null) && !string.IsNullOrWhiteSpace(record.SourceDetailUrl))
         {
-            var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
-            if (detail is not null)
+            try
             {
-                if (!string.IsNullOrWhiteSpace(detail.Brand))
+                var detail = await GetVehicleDetailWithRetry(hsnTsnClient, record.SourceDetailUrl, record.HsnTsn);
+                if (detail is not null)
                 {
-                    vehicle.Brand = detail.Brand;
-                }
+                    if (record.YearFrom is null && detail.YearFrom is not null)
+                    {
+                        record.YearFrom = detail.YearFrom;
+                        changed = true;
+                    }
 
-                vehicle.Model = detail.Model;
-                vehicle.OfficialType = detail.OfficialType;
-                vehicle.YearFrom = detail.YearFrom;
-                vehicle.YearTo = detail.YearTo;
-
-                if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
-                {
-                    vehicle.SourceDetailUrl = detail.CanonicalUrl;
+                    if (record.YearTo is null && detail.YearTo is not null)
+                    {
+                        record.YearTo = detail.YearTo;
+                        changed = true;
+                    }
                 }
             }
+            catch (Exception ex)
+            {
+                failed++;
+                Console.Error.WriteLine($"[warn] Repair failed: {record.HsnTsn} -> {ex.Message}");
+            }
+        }
+
+        if (changed)
+        {
+            updated++;
+        }
+
+        record.MatchKey = BuildMatchKey(record);
+        csvWriter.WriteRecord(record);
+        await csvWriter.NextRecordAsync();
+
+        if (processed % 250 == 0)
+        {
+            await csvWriter.FlushAsync();
+            Console.Error.WriteLine($"[info] Repair progress: processed={processed}, updated={updated}, failed={failed}");
+        }
+    }
+
+    await csvWriter.FlushAsync();
+    Console.Error.WriteLine($"[info] Repair finished. processed={processed}, updated={updated}, failed={failed}");
+}
+
+async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages)
+{
+    var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+    var processed = 0;
+    var failed = 0;
+
+    Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetailPages}");
+
+    await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
+    {
+        Delimiter = ";"
+    });
+
+    csvWriter.WriteHeader<HsnTsnVehicle>();
+    await csvWriter.NextRecordAsync();
+    await csvWriter.FlushAsync();
+
+    if (Console.IsInputRedirected)
+    {
+        await foreach (var query in ReadInput())
+        {
+            IReadOnlyList<HsnTsnVehicle> vehicles;
+            try
+            {
+                vehicles = await hsnTsnClient.GetVehiclesFromSearchAsync(query);
+            }
+            catch (Exception ex)
+            {
+                failed++;
+                Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
+                continue;
+            }
+
+            foreach (var vehicle in vehicles)
+            {
+                await WriteVehicleIfNew(vehicle);
+            }
+        }
+    }
+    else
+    {
+        IReadOnlyList<string> brandUrls;
+        try
+        {
+            brandUrls = await hsnTsnClient.GetBrandPageUrls();
         }
         catch (Exception ex)
         {
-            failed++;
-            Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
+            Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
+            return;
+        }
+
+        foreach (var url in brandUrls)
+        {
+            Console.Error.WriteLine($"[info] Processing: {url}");
+            IReadOnlyList<HsnTsnVehicle> vehicles;
+            try
+            {
+                vehicles = await hsnTsnClient.GetVehiclesFromBrandPageAsync(url);
+            }
+            catch (Exception ex)
+            {
+                failed++;
+                Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
+                continue;
+            }
+
+            foreach (var vehicle in vehicles)
+            {
+                await WriteVehicleIfNew(vehicle);
+            }
         }
     }
 
-    vehicle.MatchKey = BuildMatchKey(vehicle);
-    csvWriter.WriteRecord(vehicle);
-    await csvWriter.NextRecordAsync();
+    await csvWriter.FlushAsync();
+    Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
+    return;
 
-    if (written.Count % 250 == 0)
+    async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
     {
-        await csvWriter.FlushAsync();
+        processed++;
+
+        if (!written.Add(vehicle.HsnTsn))
+        {
+            return;
+        }
+
+        if (includeDetailPages && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
+        {
+            try
+            {
+                var detail = await GetVehicleDetailWithRetry(hsnTsnClient, vehicle.SourceDetailUrl, vehicle.HsnTsn);
+                if (detail is not null)
+                {
+                    if (!string.IsNullOrWhiteSpace(detail.Brand))
+                    {
+                        vehicle.Brand = detail.Brand;
+                    }
+
+                    vehicle.Model = detail.Model;
+                    vehicle.OfficialType = detail.OfficialType;
+                    vehicle.YearFrom = detail.YearFrom;
+                    vehicle.YearTo = detail.YearTo;
+
+                    if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
+                    {
+                        vehicle.SourceDetailUrl = detail.CanonicalUrl;
+                    }
+                }
+            }
+            catch (Exception ex)
+            {
+                failed++;
+                Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
+            }
+        }
+
+        vehicle.MatchKey = BuildMatchKey(vehicle);
+        csvWriter.WriteRecord(vehicle);
+        await csvWriter.NextRecordAsync();
+
+        if (written.Count % 250 == 0)
+        {
+            await csvWriter.FlushAsync();
+        }
     }
 }
 
+static string? GetOptionValue(string[] cliArgs, string optionName)
+{
+    for (var i = 0; i < cliArgs.Length; i++)
+    {
+        var arg = cliArgs[i];
+        if (arg.Equals(optionName, StringComparison.OrdinalIgnoreCase))
+        {
+            if (i + 1 < cliArgs.Length)
+            {
+                return cliArgs[i + 1];
+            }
+
+            return null;
+        }
+
+        var prefix = optionName + "=";
+        if (arg.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
+        {
+            return arg[prefix.Length..];
+        }
+    }
+
+    return null;
+}
+
+async Task<HsnTsnVehicleDetail?> GetVehicleDetailWithRetry(HsnTsnClient hsnTsnClient, string detailUrl, string hsnTsn)
+{
+    const int maxAttempts = 7;
+    Console.Error.WriteLine($"[info] Fetching detail for HSN/TSN: {hsnTsn}");
+
+    for (var attempt = 1; attempt <= maxAttempts; attempt++)
+    {
+        try
+        {
+            return await hsnTsnClient.GetVehicleDetailAsync(detailUrl);
+        }
+        catch (HttpRequestException ex) when (IsTooManyRequests(ex) && attempt < maxAttempts)
+        {
+            var delaySeconds = Math.Min(60, (int)Math.Pow(2, attempt));
+            Console.Error.WriteLine($"[warn] 429 for detail url, retrying in {delaySeconds}s (attempt {attempt}/{maxAttempts})");
+            await Task.Delay(TimeSpan.FromSeconds(delaySeconds));
+        }
+    }
+
+    return await hsnTsnClient.GetVehicleDetailAsync(detailUrl);
+}
+
+static bool IsTooManyRequests(HttpRequestException ex)
+{
+    if (ex.StatusCode == HttpStatusCode.TooManyRequests)
+    {
+        return true;
+    }
+
+    return ex.Message.Contains("429", StringComparison.OrdinalIgnoreCase);
+}
+
 async IAsyncEnumerable<string> ReadInput()
 {
     var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
@@ -160,6 +326,6 @@ string BuildMatchKey(HsnTsnVehicle vehicle)
         .Replace("Ü", "UE")
         .Replace("ß", "SS");
 
-    normalized = System.Text.RegularExpressions.Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
-    return System.Text.RegularExpressions.Regex.Replace(normalized, @"\s+", " ");
+    normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
+    return Regex.Replace(normalized, @"\s+", " ");
 }