Add repair mode for missing year fields in vehicle data; update README with usage instructions
This commit is contained in:
@@ -2,3 +2,5 @@ bin/
|
||||
obj/
|
||||
.idea/
|
||||
hsntsn.csv
|
||||
hsntsn.repaired.csv
|
||||
scrape.log
|
||||
@@ -32,3 +32,9 @@ Enable detail-page enrichment:
|
||||
```bash
|
||||
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
|
||||
```
|
||||
|
||||
Repair only the missing year fields of an existing CSV (the result is written to a separate output file; the input file is not modified):
|
||||
|
||||
```bash
|
||||
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv
|
||||
```
|
||||
|
||||
@@ -1,16 +1,124 @@
|
||||
using System.Globalization;
|
||||
using System.Net;
|
||||
using System.Text.RegularExpressions;
|
||||
using CsvHelper;
|
||||
using CsvHelper.Configuration;
|
||||
using HsnTsnScraper;
|
||||
|
||||
// ---- Command-line handling -------------------------------------------------
// Boolean switches are matched case-insensitively anywhere in the argument list.
var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase);
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);

// Optional file paths; null when the option was not supplied.
var inputCsv = GetOptionValue(args, "--input-csv");
var outputCsv = GetOptionValue(args, "--output-csv");

// One client instance is shared by whichever mode runs and disposed on exit.
using var client = new HsnTsnClient();

// ---- Mode dispatch ---------------------------------------------------------
// --repair-years takes precedence over the regular scrape mode.
await (repairYears
    ? RunRepairYearsMode(client, inputCsv, outputCsv)
    : RunScrapeMode(client, includeDetails));

return;
|
||||
|
||||
// Repairs missing YearFrom / YearTo values in an existing CSV export.
//
// Every record of the input CSV (default "hsntsn.csv") is copied to the output CSV
// (default "hsntsn.repaired.csv"). For records that lack at least one year value and
// carry a SourceDetailUrl, the detail page is fetched again and only the missing
// year values are filled in; values already present are never overwritten.
// MatchKey is recomputed for every record before it is written.
//
// Progress, warnings and errors are written to stderr. Validation problems (identical
// input/output path, missing input file, input without a header row) are reported as
// "[error]" lines, set the process exit code to 1 and leave the output file untouched.
//
// hsnTsnClient: client used to fetch detail pages.
// inputPath:    path of the CSV to read; null/blank selects the default.
// outputPath:   path of the CSV to write; null/blank selects the default.
async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
{
    var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
    var outputCsvPath = string.IsNullOrWhiteSpace(outputPath) ? "hsntsn.repaired.csv" : outputPath;

    // Reading and writing the same file at once would truncate the input before it is read.
    // The comparison is deliberately case-insensitive, which errs on the side of refusing.
    if (Path.GetFullPath(inputCsvPath).Equals(Path.GetFullPath(outputCsvPath), StringComparison.OrdinalIgnoreCase))
    {
        Console.Error.WriteLine("[error] --input-csv and --output-csv cannot point to the same file.");
        Environment.ExitCode = 1;
        return;
    }

    // Fail with a readable message instead of an unhandled FileNotFoundException.
    if (!File.Exists(inputCsvPath))
    {
        Console.Error.WriteLine($"[error] Input CSV not found: {inputCsvPath}");
        Environment.ExitCode = 1;
        return;
    }

    Console.Error.WriteLine($"[info] Repair mode started. input={inputCsvPath}, output={outputCsvPath}");

    var processed = 0;
    var failed = 0;
    var updated = 0;

    await using var inputStream = File.OpenRead(inputCsvPath);
    using var inputReader = new StreamReader(inputStream);
    using var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";",
        // Tolerate input files that lack some columns of HsnTsnVehicle.
        MissingFieldFound = null,
        HeaderValidated = null
    });

    // Validate the input BEFORE the output file is created, so that a broken input
    // never truncates an output file left over from an earlier run.
    if (!await csvReader.ReadAsync())
    {
        Console.Error.WriteLine($"[error] Input CSV is empty (no header row): {inputCsvPath}");
        Environment.ExitCode = 1;
        return;
    }

    csvReader.ReadHeader();

    await using var outputStream = File.Create(outputCsvPath);
    await using var outputWriter = new StreamWriter(outputStream);
    await using var csvWriter = new CsvWriter(outputWriter, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";"
    });

    csvWriter.WriteHeader<HsnTsnVehicle>();
    await csvWriter.NextRecordAsync();

    await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
    {
        processed++;
        var changed = false;

        // Only records that miss a year AND can be looked up again are candidates for repair.
        if ((record.YearFrom is null || record.YearTo is null) && !string.IsNullOrWhiteSpace(record.SourceDetailUrl))
        {
            try
            {
                var detail = await GetVehicleDetailWithRetry(hsnTsnClient, record.SourceDetailUrl, record.HsnTsn);
                if (detail is not null)
                {
                    if (record.YearFrom is null && detail.YearFrom is not null)
                    {
                        record.YearFrom = detail.YearFrom;
                        changed = true;
                    }

                    if (record.YearTo is null && detail.YearTo is not null)
                    {
                        record.YearTo = detail.YearTo;
                        changed = true;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failed lookup is counted and logged, the record is still copied as-is.
                failed++;
                Console.Error.WriteLine($"[warn] Repair failed: {record.HsnTsn} -> {ex.Message}");
            }
        }

        if (changed)
        {
            updated++;
        }

        record.MatchKey = BuildMatchKey(record);
        csvWriter.WriteRecord(record);
        await csvWriter.NextRecordAsync();

        // Flush periodically so an interrupted run keeps most of its work on disk.
        if (processed % 250 == 0)
        {
            await csvWriter.FlushAsync();
            Console.Error.WriteLine($"[info] Repair progress: processed={processed}, updated={updated}, failed={failed}");
        }
    }

    await csvWriter.FlushAsync();
    Console.Error.WriteLine($"[info] Repair finished. processed={processed}, updated={updated}, failed={failed}");
}
|
||||
|
||||
async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages)
|
||||
{
|
||||
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||
var processed = 0;
|
||||
var failed = 0;
|
||||
|
||||
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
|
||||
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetailPages}");
|
||||
|
||||
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
|
||||
{
|
||||
@@ -28,7 +136,7 @@ if (Console.IsInputRedirected)
|
||||
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||
try
|
||||
{
|
||||
vehicles = await client.GetVehiclesFromSearchAsync(query);
|
||||
vehicles = await hsnTsnClient.GetVehiclesFromSearchAsync(query);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -48,7 +156,7 @@ else
|
||||
IReadOnlyList<string> brandUrls;
|
||||
try
|
||||
{
|
||||
brandUrls = await client.GetBrandPageUrls();
|
||||
brandUrls = await hsnTsnClient.GetBrandPageUrls();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -62,7 +170,7 @@ else
|
||||
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||
try
|
||||
{
|
||||
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
|
||||
vehicles = await hsnTsnClient.GetVehiclesFromBrandPageAsync(url);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -91,11 +199,11 @@ async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
|
||||
return;
|
||||
}
|
||||
|
||||
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
||||
if (includeDetailPages && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
||||
{
|
||||
try
|
||||
{
|
||||
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
|
||||
var detail = await GetVehicleDetailWithRetry(hsnTsnClient, vehicle.SourceDetailUrl, vehicle.HsnTsn);
|
||||
if (detail is not null)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
||||
@@ -130,6 +238,64 @@ async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
|
||||
await csvWriter.FlushAsync();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the value of a command-line option, or null when it is absent or has no value.
//
// Two spellings are accepted, both matched case-insensitively on the option name:
//   --name value     (value is the next argument)
//   --name=value     (value is everything after the '=')
// The first occurrence of the option decides the result.
//
// cliArgs:    the raw command-line arguments.
// optionName: the option including its leading dashes, e.g. "--input-csv".
static string? GetOptionValue(string[] cliArgs, string optionName)
{
    // Loop-invariant, so build it once.
    var prefix = optionName + "=";

    for (var i = 0; i < cliArgs.Length; i++)
    {
        var arg = cliArgs[i];

        if (arg.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            return arg[prefix.Length..];
        }

        if (!arg.Equals(optionName, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // The value is the following argument - unless that argument is itself another
        // option ("--input-csv --output-csv x"), in which case this option has no value
        // and must not swallow the next flag.
        var hasValue = i + 1 < cliArgs.Length
                       && !cliArgs[i + 1].StartsWith("--", StringComparison.Ordinal);

        return hasValue ? cliArgs[i + 1] : null;
    }

    return null;
}
|
||||
|
||||
// Fetches the detail page for one vehicle, retrying when the server answers with
// HTTP 429 (as detected by IsTooManyRequests).
//
// Up to 7 attempts are made. After a 429 on attempt k (k < 7) the call waits
// min(60, 2^k) seconds before trying again. Any other exception - and a 429 on the
// final attempt - propagates to the caller unchanged.
//
// hsnTsnClient: client that performs the request.
// detailUrl:    URL of the detail page.
// hsnTsn:       HSN/TSN code, used for the log line only.
async Task<VehicleDetail?> GetVehicleDetailWithRetry(HsnTsnClient hsnTsnClient, string detailUrl, string hsnTsn)
{
    const int maxAttempts = 7;
    Console.Error.WriteLine($"[info] Fetching detail for HSN/TSN: {hsnTsn}");

    var attempt = 1;
    while (true)
    {
        try
        {
            return await hsnTsnClient.GetVehicleDetailAsync(detailUrl);
        }
        catch (HttpRequestException ex) when (attempt < maxAttempts && IsTooManyRequests(ex))
        {
            // Exponential backoff: 2, 4, 8, 16, 32 and then capped at 60 seconds.
            var delaySeconds = Math.Min(60, 1 << attempt);
            Console.Error.WriteLine($"[warn] 429 for detail url, retrying in {delaySeconds}s (attempt {attempt}/{maxAttempts})");
            await Task.Delay(TimeSpan.FromSeconds(delaySeconds));
        }

        // Only reached after a handled 429; success returns and other failures throw.
        attempt++;
    }
}
|
||||
|
||||
// Reports whether an HTTP failure represents "429 Too Many Requests".
// The status code is checked first; when it does not match (or is not set), the
// exception message is searched for the text "429" as a fallback.
static bool IsTooManyRequests(HttpRequestException ex) =>
    ex.StatusCode == HttpStatusCode.TooManyRequests
    || ex.Message.Contains("429", StringComparison.OrdinalIgnoreCase);
|
||||
|
||||
async IAsyncEnumerable<string> ReadInput()
|
||||
{
|
||||
@@ -160,6 +326,6 @@ string BuildMatchKey(HsnTsnVehicle vehicle)
|
||||
.Replace("Ü", "UE")
|
||||
.Replace("ß", "SS");
|
||||
|
||||
normalized = System.Text.RegularExpressions.Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
|
||||
return System.Text.RegularExpressions.Regex.Replace(normalized, @"\s+", " ");
|
||||
normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
|
||||
return Regex.Replace(normalized, @"\s+", " ");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user