Add repair mode for missing year fields in vehicle data; update README with usage instructions

This commit is contained in:
2026-03-04 22:54:25 +03:00
parent 3e88086872
commit e5df149808
3 changed files with 272 additions and 98 deletions
+2
View File
@@ -2,3 +2,5 @@ bin/
obj/
.idea/
hsntsn.csv
hsntsn.repaired.csv
scrape.log
+6
View File
@@ -32,3 +32,9 @@ Enable detail-page enrichment:
```bash
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
```
Repair only missing year fields from an existing CSV:
```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv
```
+193 -27
View File
@@ -1,34 +1,142 @@
using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;
using CsvHelper;
using CsvHelper.Configuration;
using HsnTsnScraper;
// Entry point: parse the command-line switches and dispatch to exactly one mode.
// Flag names are matched case-insensitively.
var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase);
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);

// Only consulted in repair mode; RunRepairYearsMode substitutes defaults when they are absent.
var inputCsv = GetOptionValue(args, "--input-csv");
var outputCsv = GetOptionValue(args, "--output-csv");

// A single client instance is shared by whichever mode runs and disposed on exit.
using var client = new HsnTsnClient();

// Scrape-specific state (dedupe set, counters, stdout CSV writer) is owned by RunScrapeMode,
// so nothing is written to stdout when only a repair was requested.
if (repairYears)
{
    await RunRepairYearsMode(client, inputCsv, outputCsv);
    return;
}

await RunScrapeMode(client, includeDetails);
return;
// Repair mode: copies every row of an existing semicolon-delimited CSV to a new file and, for rows
// that lack YearFrom and/or YearTo but carry a SourceDetailUrl, tries to fill the missing year(s)
// from the vehicle's detail page. Existing year values are never overwritten.
//
// hsnTsnClient - client used to fetch detail pages.
// inputPath    - CSV to read; falls back to "hsntsn.csv" when null/blank.
// outputPath   - CSV to write; falls back to "hsntsn.repaired.csv" when null/blank.
//
// Progress and diagnostics go to stderr. A failed detail lookup is logged and counted, and the
// row is still written unchanged, so the output always contains every input row.
async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
{
    var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
    var outputCsvPath = string.IsNullOrWhiteSpace(outputPath) ? "hsntsn.repaired.csv" : outputPath;

    // The output file is created (truncated) while the input is still being streamed,
    // so pointing both at the same file would destroy the data being read.
    if (Path.GetFullPath(inputCsvPath).Equals(Path.GetFullPath(outputCsvPath), StringComparison.OrdinalIgnoreCase))
    {
        Console.Error.WriteLine("[error] --input-csv and --output-csv cannot point to the same file.");
        return;
    }

    // Report a missing input in the same style as the other errors instead of letting
    // File.OpenRead throw, and before an empty output file gets created.
    if (!File.Exists(inputCsvPath))
    {
        Console.Error.WriteLine($"[error] Input CSV not found: {inputCsvPath}");
        return;
    }

    Console.Error.WriteLine($"[info] Repair mode started. input={inputCsvPath}, output={outputCsvPath}");

    var processed = 0;
    var failed = 0;
    var updated = 0;

    await using var inputStream = File.OpenRead(inputCsvPath);
    using var inputReader = new StreamReader(inputStream);
    using var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";",
        // Tolerate input files whose columns do not match the current HsnTsnVehicle shape exactly.
        MissingFieldFound = null,
        HeaderValidated = null
    });

    await using var outputStream = File.Create(outputCsvPath);
    await using var outputWriter = new StreamWriter(outputStream);
    await using var csvWriter = new CsvWriter(outputWriter, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";"
    });

    // The header is written exactly once, whether or not the input has any rows.
    csvWriter.WriteHeader<HsnTsnVehicle>();
    await csvWriter.NextRecordAsync();

    // An empty input has no header row to read; emit a header-only output and stop.
    if (!await csvReader.ReadAsync())
    {
        await csvWriter.FlushAsync();
        Console.Error.WriteLine("[warn] Input CSV is empty; wrote header only.");
        return;
    }

    csvReader.ReadHeader();

    await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
    {
        processed++;
        var changed = false;

        // Only rows that are actually missing a year and have somewhere to look it up cost a request.
        if ((record.YearFrom is null || record.YearTo is null) && !string.IsNullOrWhiteSpace(record.SourceDetailUrl))
        {
            try
            {
                var detail = await GetVehicleDetailWithRetry(hsnTsnClient, record.SourceDetailUrl, record.HsnTsn);
                if (detail is not null)
                {
                    if (record.YearFrom is null && detail.YearFrom is not null)
                    {
                        record.YearFrom = detail.YearFrom;
                        changed = true;
                    }

                    if (record.YearTo is null && detail.YearTo is not null)
                    {
                        record.YearTo = detail.YearTo;
                        changed = true;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: keep the row as it was and carry on with the next one.
                failed++;
                Console.Error.WriteLine($"[warn] Repair failed: {record.HsnTsn} -> {ex.Message}");
            }
        }

        if (changed)
        {
            updated++;
        }

        // The match key is recomputed for every row, repaired or not.
        record.MatchKey = BuildMatchKey(record);
        csvWriter.WriteRecord(record);
        await csvWriter.NextRecordAsync();

        // Flush periodically so an interrupted run still leaves usable output on disk.
        if (processed % 250 == 0)
        {
            await csvWriter.FlushAsync();
            Console.Error.WriteLine($"[info] Repair progress: processed={processed}, updated={updated}, failed={failed}");
        }
    }

    await csvWriter.FlushAsync();
    Console.Error.WriteLine($"[info] Repair finished. processed={processed}, updated={updated}, failed={failed}");
}
async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages)
{
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0;
var failed = 0;
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetailPages}");
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";"
});
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
await csvWriter.FlushAsync();
if (Console.IsInputRedirected)
{
await foreach (var query in ReadInput())
{
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromSearchAsync(query);
vehicles = await hsnTsnClient.GetVehiclesFromSearchAsync(query);
}
catch (Exception ex)
{
@@ -42,13 +150,13 @@ if (Console.IsInputRedirected)
await WriteVehicleIfNew(vehicle);
}
}
}
else
{
}
else
{
IReadOnlyList<string> brandUrls;
try
{
brandUrls = await client.GetBrandPageUrls();
brandUrls = await hsnTsnClient.GetBrandPageUrls();
}
catch (Exception ex)
{
@@ -62,7 +170,7 @@ else
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
vehicles = await hsnTsnClient.GetVehiclesFromBrandPageAsync(url);
}
catch (Exception ex)
{
@@ -76,14 +184,14 @@ else
await WriteVehicleIfNew(vehicle);
}
}
}
}
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
return;
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
return;
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{
processed++;
if (!written.Add(vehicle.HsnTsn))
@@ -91,11 +199,11 @@ async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
return;
}
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
if (includeDetailPages && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
try
{
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
var detail = await GetVehicleDetailWithRetry(hsnTsnClient, vehicle.SourceDetailUrl, vehicle.HsnTsn);
if (detail is not null)
{
if (!string.IsNullOrWhiteSpace(detail.Brand))
@@ -129,6 +237,64 @@ async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{
await csvWriter.FlushAsync();
}
}
}
// Returns the value of a command-line option, accepting both "--name value" and "--name=value"
// (option name matched case-insensitively). The first occurrence of the option decides.
//
// cliArgs    - the raw argument vector.
// optionName - the option to look for, including its leading dashes (e.g. "--input-csv").
//
// Returns null when the option is absent, is the last argument, or is immediately followed by
// another "--option" token; a following option is never consumed as this option's value.
static string? GetOptionValue(string[] cliArgs, string optionName)
{
    var prefix = optionName + "=";

    for (var i = 0; i < cliArgs.Length; i++)
    {
        var arg = cliArgs[i];

        if (arg.Equals(optionName, StringComparison.OrdinalIgnoreCase))
        {
            if (i + 1 >= cliArgs.Length)
            {
                return null;
            }

            var candidate = cliArgs[i + 1];

            // "--input-csv --output-csv x" means the input value was omitted,
            // not that the input file is called "--output-csv".
            return candidate.StartsWith("--", StringComparison.Ordinal) ? null : candidate;
        }

        if (arg.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            return arg[prefix.Length..];
        }
    }

    return null;
}
// Fetches a vehicle's detail page, retrying with exponential back-off when the request fails
// with HTTP 429 (as judged by IsTooManyRequests).
//
// hsnTsnClient - client used for the request.
// detailUrl    - detail page URL to fetch.
// hsnTsn       - HSN/TSN of the vehicle; used only for the log line.
//
// Returns whatever GetVehicleDetailAsync returns (may be null).
// At most maxAttempts requests are made. Any non-429 failure, and a 429 on the final attempt,
// propagates to the caller unchanged.
async Task<VehicleDetail?> GetVehicleDetailWithRetry(HsnTsnClient hsnTsnClient, string detailUrl, string hsnTsn)
{
    const int maxAttempts = 7;
    Console.Error.WriteLine($"[info] Fetching detail for HSN/TSN: {hsnTsn}");

    // No loop condition: every iteration either returns, throws, or waits and retries.
    // The exception filter stops matching on the last attempt, so that failure escapes the loop
    // and no extra request after the loop is needed.
    for (var attempt = 1; ; attempt++)
    {
        try
        {
            return await hsnTsnClient.GetVehicleDetailAsync(detailUrl);
        }
        catch (HttpRequestException ex) when (IsTooManyRequests(ex) && attempt < maxAttempts)
        {
            // Back-off of 2, 4, 8, 16, 32 seconds, then capped at 60.
            var delaySeconds = Math.Min(60, 1 << attempt);
            Console.Error.WriteLine($"[warn] 429 for detail url, retrying in {delaySeconds}s (attempt {attempt}/{maxAttempts})");
            await Task.Delay(TimeSpan.FromSeconds(delaySeconds));
        }
    }
}
// Decides whether an HTTP failure represents rate limiting (HTTP 429 Too Many Requests).
//
// When the exception carries a status code, that code is authoritative. The message text is
// inspected only as a fallback for exceptions created without a status code, so that a different
// status (e.g. 500) whose message merely contains the digits "429" is not retried as a 429.
static bool IsTooManyRequests(HttpRequestException ex)
{
    if (ex.StatusCode is { } statusCode)
    {
        return statusCode == HttpStatusCode.TooManyRequests;
    }

    return ex.Message.Contains("429", StringComparison.Ordinal);
}
async IAsyncEnumerable<string> ReadInput()
@@ -160,6 +326,6 @@ string BuildMatchKey(HsnTsnVehicle vehicle)
.Replace("Ü", "UE")
.Replace("ß", "SS");
normalized = System.Text.RegularExpressions.Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
return System.Text.RegularExpressions.Regex.Replace(normalized, @"\s+", " ");
normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
return Regex.Replace(normalized, @"\s+", " ");
}