Add repair mode for missing year fields in vehicle data; update README with usage instructions

This commit is contained in:
2026-03-04 22:54:25 +03:00
parent 3e88086872
commit e5df149808
3 changed files with 272 additions and 98 deletions
+2
View File
@@ -2,3 +2,5 @@ bin/
obj/
.idea/
hsntsn.csv
hsntsn.repaired.csv
scrape.log
+6
View File
@@ -32,3 +32,9 @@ Enable detail-page enrichment:
```bash
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
```
Repair only missing year fields from an existing CSV:
```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv
```
+263 -97
View File
@@ -1,136 +1,302 @@
using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;
using CsvHelper;
using CsvHelper.Configuration;
using HsnTsnScraper;
// NOTE(review): this span looks like a rendered diff in which removed and added lines are
// interleaved without +/- markers — confirm statement order against the repository.
// Command-line switches: the two flags are matched case-insensitively; the CSV paths are
// read through GetOptionValue and may be null when the option is absent.
var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase);
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
var inputCsv = GetOptionValue(args, "--input-csv");
var outputCsv = GetOptionValue(args, "--output-csv");
// One client instance is created here and handed to whichever mode runs.
using var client = new HsnTsnClient();
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0;
var failed = 0;
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
if (repairYears)
{
Delimiter = ";"
});
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
await csvWriter.FlushAsync();
if (Console.IsInputRedirected)
{
await foreach (var query in ReadInput())
{
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromSearchAsync(query);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
continue;
}
foreach (var vehicle in vehicles)
{
await WriteVehicleIfNew(vehicle);
}
}
}
else
{
IReadOnlyList<string> brandUrls;
try
{
brandUrls = await client.GetBrandPageUrls();
}
catch (Exception ex)
{
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
return;
}
foreach (var url in brandUrls)
{
Console.Error.WriteLine($"[info] Processing: {url}");
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
continue;
}
foreach (var vehicle in vehicles)
{
await WriteVehicleIfNew(vehicle);
}
}
// Repair mode: hand the client and both CSV paths to RunRepairYearsMode, then exit
// without scraping.
await RunRepairYearsMode(client, inputCsv, outputCsv);
return;
}
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
// Default mode: run the scraper, passing the --include-details flag through.
await RunScrapeMode(client, includeDetails);
return;
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
// Repair mode: reads an existing semicolon-delimited CSV of HsnTsnVehicle rows, fetches the
// detail page only for rows that lack YearFrom or YearTo and carry a SourceDetailUrl, fills
// in the missing year fields, and writes every row (repaired or not) to the output CSV.
//
// hsnTsnClient: client passed on to GetVehicleDetailWithRetry.
// inputPath:    CSV to read; blank or null falls back to "hsntsn.csv".
// outputPath:   CSV to create; blank or null falls back to "hsntsn.repaired.csv".
//
// NOTE(review): this span looks like a rendered diff in which removed and added lines are
// interleaved without +/- markers — confirm statement order against the repository.
async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
{
processed++;
var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
var outputCsvPath = string.IsNullOrWhiteSpace(outputPath) ? "hsntsn.repaired.csv" : outputPath;
if (!written.Add(vehicle.HsnTsn))
// Refuse to run when both paths resolve to the same full path (compared case-insensitively):
// the output file is created while the input is still being read.
if (Path.GetFullPath(inputCsvPath).Equals(Path.GetFullPath(outputCsvPath), StringComparison.OrdinalIgnoreCase))
{
Console.Error.WriteLine("[error] --input-csv and --output-csv cannot point to the same file.");
return;
}
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
Console.Error.WriteLine($"[info] Repair mode started. input={inputCsvPath}, output={outputCsvPath}");
// Counters reported in the progress and summary log lines.
var processed = 0;
var failed = 0;
var updated = 0;
await using var inputStream = File.OpenRead(inputCsvPath);
using var inputReader = new StreamReader(inputStream);
// NOTE(review): MissingFieldFound/HeaderValidated are set to null, presumably so CSVs that
// lack some HsnTsnVehicle columns still load — confirm against the CsvHelper documentation.
using var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
{
try
Delimiter = ";",
MissingFieldFound = null,
HeaderValidated = null
});
await using var outputStream = File.Create(outputCsvPath);
await using var outputWriter = new StreamWriter(outputStream);
await using var csvWriter = new CsvWriter(outputWriter, new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";"
});
// Consume the input header row, then emit the output header for HsnTsnVehicle.
await csvReader.ReadAsync();
csvReader.ReadHeader();
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
{
processed++;
var changed = false;
// Only rows missing at least one year AND having a detail URL trigger a network request.
if ((record.YearFrom is null || record.YearTo is null) && !string.IsNullOrWhiteSpace(record.SourceDetailUrl))
{
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
if (detail is not null)
try
{
if (!string.IsNullOrWhiteSpace(detail.Brand))
var detail = await GetVehicleDetailWithRetry(hsnTsnClient, record.SourceDetailUrl, record.HsnTsn);
if (detail is not null)
{
vehicle.Brand = detail.Brand;
}
// A year is filled in only when the row's value is null and the detail page supplies one;
// values already present in the CSV are never overwritten.
if (record.YearFrom is null && detail.YearFrom is not null)
{
record.YearFrom = detail.YearFrom;
changed = true;
}
vehicle.Model = detail.Model;
vehicle.OfficialType = detail.OfficialType;
vehicle.YearFrom = detail.YearFrom;
vehicle.YearTo = detail.YearTo;
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
{
vehicle.SourceDetailUrl = detail.CanonicalUrl;
if (record.YearTo is null && detail.YearTo is not null)
{
record.YearTo = detail.YearTo;
changed = true;
}
}
}
// A failed lookup is counted and logged; the row is still written below.
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Repair failed: {record.HsnTsn} -> {ex.Message}");
}
}
if (changed)
{
updated++;
}
// MatchKey is rebuilt for every row, whether or not a year was repaired.
record.MatchKey = BuildMatchKey(record);
csvWriter.WriteRecord(record);
await csvWriter.NextRecordAsync();
// Flush and report progress every 250 rows.
if (processed % 250 == 0)
{
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Repair progress: processed={processed}, updated={updated}, failed={failed}");
}
}
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Repair finished. processed={processed}, updated={updated}, failed={failed}");
}
// Scrape mode: writes HsnTsnVehicle rows as semicolon-delimited CSV to standard output.
// When standard input is redirected, each value yielded by ReadInput is used as a search
// query; otherwise every brand page URL returned by the client is processed.
//
// hsnTsnClient:       client used for search, brand-page and detail requests.
// includeDetailPages: when true, each new vehicle with a SourceDetailUrl is enriched from
//                     its detail page before being written.
//
// NOTE(review): this span looks like a rendered diff in which removed and added lines are
// interleaved without +/- markers — confirm statement order against the repository.
async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages)
{
// HsnTsn values already written, compared case-insensitively, used for de-duplication.
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0;
var failed = 0;
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetailPages}");
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";"
});
// The header is written and flushed before any request is made.
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
await csvWriter.FlushAsync();
if (Console.IsInputRedirected)
{
await foreach (var query in ReadInput())
{
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await hsnTsnClient.GetVehiclesFromSearchAsync(query);
}
// A failing query is counted and logged, then the next query is tried.
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
continue;
}
foreach (var vehicle in vehicles)
{
await WriteVehicleIfNew(vehicle);
}
}
}
else
{
IReadOnlyList<string> brandUrls;
try
{
brandUrls = await hsnTsnClient.GetBrandPageUrls();
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
return;
}
foreach (var url in brandUrls)
{
Console.Error.WriteLine($"[info] Processing: {url}");
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await hsnTsnClient.GetVehiclesFromBrandPageAsync(url);
}
// A failing brand page is counted and logged, then the next page is tried.
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
continue;
}
foreach (var vehicle in vehicles)
{
await WriteVehicleIfNew(vehicle);
}
}
}
vehicle.MatchKey = BuildMatchKey(vehicle);
csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync();
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
return;
if (written.Count % 250 == 0)
// Writes one vehicle to the CSV unless its HsnTsn was already written. `processed` counts
// every vehicle seen, including duplicates that are skipped.
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{
await csvWriter.FlushAsync();
processed++;
if (!written.Add(vehicle.HsnTsn))
{
return;
}
if (includeDetailPages && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
try
{
var detail = await GetVehicleDetailWithRetry(hsnTsnClient, vehicle.SourceDetailUrl, vehicle.HsnTsn);
if (detail is not null)
{
// Brand and SourceDetailUrl are replaced only by non-blank detail values; Model,
// OfficialType, YearFrom and YearTo are overwritten unconditionally.
if (!string.IsNullOrWhiteSpace(detail.Brand))
{
vehicle.Brand = detail.Brand;
}
vehicle.Model = detail.Model;
vehicle.OfficialType = detail.OfficialType;
vehicle.YearFrom = detail.YearFrom;
vehicle.YearTo = detail.YearTo;
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
{
vehicle.SourceDetailUrl = detail.CanonicalUrl;
}
}
}
// A failed detail lookup is counted and logged; the vehicle is still written below.
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
}
}
vehicle.MatchKey = BuildMatchKey(vehicle);
csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync();
// Flush each time the number of distinct vehicles written reaches a multiple of 250.
if (written.Count % 250 == 0)
{
await csvWriter.FlushAsync();
}
}
}
// Looks up the value of a command-line option, matching the option name case-insensitively.
//
// Two spellings are accepted:
//   --name value   (value is the following argument)
//   --name=value   (value is everything after the first '=')
//
// cliArgs:    raw command-line arguments.
// optionName: option to look for, including its leading dashes (e.g. "--input-csv").
//
// Returns the value of the first occurrence of the option, or null when the option is
// absent, is the last argument, or is directly followed by another "--" switch.
static string? GetOptionValue(string[] cliArgs, string optionName)
{
    // Loop-invariant, so build it once.
    var prefix = optionName + "=";

    for (var i = 0; i < cliArgs.Length; i++)
    {
        var arg = cliArgs[i];

        if (arg.Equals(optionName, StringComparison.OrdinalIgnoreCase))
        {
            if (i + 1 >= cliArgs.Length)
            {
                return null;
            }

            var candidate = cliArgs[i + 1];

            // "--input-csv --output-csv out.csv" must not yield "--output-csv" as the input
            // path: a following switch means the value was omitted.
            if (candidate.StartsWith("--", StringComparison.Ordinal))
            {
                return null;
            }

            return candidate;
        }

        if (arg.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            return arg[prefix.Length..];
        }
    }

    return null;
}
// Fetches a vehicle detail page, backing off and retrying while the request fails with
// HTTP 429 (as judged by IsTooManyRequests).
//
// hsnTsnClient: client whose GetVehicleDetailAsync performs the request.
// detailUrl:    detail page URL passed to GetVehicleDetailAsync.
// hsnTsn:       identifier used only in the progress log line.
//
// Returns the result of the first attempt that does not throw. Up to 7 attempts are made.
// Any exception that is not a 429 HttpRequestException propagates immediately, and a 429 on
// the final attempt propagates too, so callers always see the original exception.
async Task<VehicleDetail?> GetVehicleDetailWithRetry(HsnTsnClient hsnTsnClient, string detailUrl, string hsnTsn)
{
    const int maxAttempts = 7;
    const int maxDelaySeconds = 60;

    Console.Error.WriteLine($"[info] Fetching detail for HSN/TSN: {hsnTsn}");

    // The loop has no exit condition on purpose: every iteration either returns, rethrows
    // (the filter rejects the exception on the last attempt), or waits and tries again.
    // That removes the need for an extra, unguarded request after the loop.
    for (var attempt = 1; ; attempt++)
    {
        try
        {
            return await hsnTsnClient.GetVehicleDetailAsync(detailUrl);
        }
        catch (HttpRequestException ex) when (IsTooManyRequests(ex) && attempt < maxAttempts)
        {
            // Exponential backoff in whole seconds: 2, 4, 8, 16, 32, then capped at 60.
            var delaySeconds = Math.Min(maxDelaySeconds, 1 << attempt);
            Console.Error.WriteLine($"[warn] 429 for detail url, retrying in {delaySeconds}s (attempt {attempt}/{maxAttempts})");
            await Task.Delay(TimeSpan.FromSeconds(delaySeconds));
        }
    }
}
// Decides whether an HttpRequestException represents an HTTP 429 (Too Many Requests) reply.
//
// ex: the exception raised by the HTTP request.
//
// Returns true when the exception carries status code 429. When the exception carries no
// status code at all, falls back to looking for "429" in its message. When a different
// status code is attached, the message is NOT consulted: an unrelated failure whose text
// merely contains "429" (for example inside a URL or an HSN/TSN number) must not be
// retried as if it were rate limiting.
static bool IsTooManyRequests(HttpRequestException ex)
{
    if (ex.StatusCode is { } statusCode)
    {
        return statusCode == HttpStatusCode.TooManyRequests;
    }

    return ex.Message.Contains("429", StringComparison.Ordinal);
}
async IAsyncEnumerable<string> ReadInput()
{
var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
@@ -160,6 +326,6 @@ string BuildMatchKey(HsnTsnVehicle vehicle)
.Replace("Ü", "UE")
.Replace("ß", "SS");
normalized = System.Text.RegularExpressions.Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
return System.Text.RegularExpressions.Regex.Replace(normalized, @"\s+", " ");
normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
return Regex.Replace(normalized, @"\s+", " ");
}