Add repair mode for missing year fields in vehicle data; update README with usage instructions
This commit is contained in:
+3
-1
@@ -1,4 +1,6 @@
|
|||||||
bin/
|
bin/
|
||||||
obj/
|
obj/
|
||||||
.idea/
|
.idea/
|
||||||
hsntsn.csv
|
hsntsn.csv
|
||||||
|
hsntsn.repaired.csv
|
||||||
|
scrape.log
|
||||||
@@ -32,3 +32,9 @@ Enable detail-page enrichment:
|
|||||||
```bash
|
```bash
|
||||||
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
|
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Repair only missing year fields from an existing CSV:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv
|
||||||
|
```
|
||||||
|
|||||||
+263
-97
@@ -1,136 +1,302 @@
|
|||||||
using System.Globalization;
|
using System.Globalization;
|
||||||
|
using System.Net;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
using CsvHelper;
|
using CsvHelper;
|
||||||
using CsvHelper.Configuration;
|
using CsvHelper.Configuration;
|
||||||
using HsnTsnScraper;
|
using HsnTsnScraper;
|
||||||
|
|
||||||
|
var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase);
|
||||||
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
|
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
|
||||||
|
var inputCsv = GetOptionValue(args, "--input-csv");
|
||||||
|
var outputCsv = GetOptionValue(args, "--output-csv");
|
||||||
|
|
||||||
using var client = new HsnTsnClient();
|
using var client = new HsnTsnClient();
|
||||||
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
|
||||||
var processed = 0;
|
|
||||||
var failed = 0;
|
|
||||||
|
|
||||||
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
|
if (repairYears)
|
||||||
|
|
||||||
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
|
|
||||||
{
|
{
|
||||||
Delimiter = ";"
|
await RunRepairYearsMode(client, inputCsv, outputCsv);
|
||||||
});
|
return;
|
||||||
|
|
||||||
csvWriter.WriteHeader<HsnTsnVehicle>();
|
|
||||||
await csvWriter.NextRecordAsync();
|
|
||||||
await csvWriter.FlushAsync();
|
|
||||||
|
|
||||||
if (Console.IsInputRedirected)
|
|
||||||
{
|
|
||||||
await foreach (var query in ReadInput())
|
|
||||||
{
|
|
||||||
IReadOnlyList<HsnTsnVehicle> vehicles;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
vehicles = await client.GetVehiclesFromSearchAsync(query);
|
|
||||||
}
|
|
||||||
catch (Exception ex)
|
|
||||||
{
|
|
||||||
failed++;
|
|
||||||
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var vehicle in vehicles)
|
|
||||||
{
|
|
||||||
await WriteVehicleIfNew(vehicle);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
IReadOnlyList<string> brandUrls;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
brandUrls = await client.GetBrandPageUrls();
|
|
||||||
}
|
|
||||||
catch (Exception ex)
|
|
||||||
{
|
|
||||||
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var url in brandUrls)
|
|
||||||
{
|
|
||||||
Console.Error.WriteLine($"[info] Processing: {url}");
|
|
||||||
IReadOnlyList<HsnTsnVehicle> vehicles;
|
|
||||||
try
|
|
||||||
{
|
|
||||||
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
|
|
||||||
}
|
|
||||||
catch (Exception ex)
|
|
||||||
{
|
|
||||||
failed++;
|
|
||||||
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
foreach (var vehicle in vehicles)
|
|
||||||
{
|
|
||||||
await WriteVehicleIfNew(vehicle);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
await csvWriter.FlushAsync();
|
await RunScrapeMode(client, includeDetails);
|
||||||
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
|
async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
|
||||||
{
|
{
|
||||||
processed++;
|
var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
|
||||||
|
var outputCsvPath = string.IsNullOrWhiteSpace(outputPath) ? "hsntsn.repaired.csv" : outputPath;
|
||||||
|
|
||||||
if (!written.Add(vehicle.HsnTsn))
|
if (Path.GetFullPath(inputCsvPath).Equals(Path.GetFullPath(outputCsvPath), StringComparison.OrdinalIgnoreCase))
|
||||||
{
|
{
|
||||||
|
Console.Error.WriteLine("[error] --input-csv and --output-csv cannot point to the same file.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
Console.Error.WriteLine($"[info] Repair mode started. input={inputCsvPath}, output={outputCsvPath}");
|
||||||
|
|
||||||
|
var processed = 0;
|
||||||
|
var failed = 0;
|
||||||
|
var updated = 0;
|
||||||
|
|
||||||
|
await using var inputStream = File.OpenRead(inputCsvPath);
|
||||||
|
using var inputReader = new StreamReader(inputStream);
|
||||||
|
using var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
|
||||||
{
|
{
|
||||||
try
|
Delimiter = ";",
|
||||||
|
MissingFieldFound = null,
|
||||||
|
HeaderValidated = null
|
||||||
|
});
|
||||||
|
|
||||||
|
await using var outputStream = File.Create(outputCsvPath);
|
||||||
|
await using var outputWriter = new StreamWriter(outputStream);
|
||||||
|
await using var csvWriter = new CsvWriter(outputWriter, new CsvConfiguration(CultureInfo.InvariantCulture)
|
||||||
|
{
|
||||||
|
Delimiter = ";"
|
||||||
|
});
|
||||||
|
|
||||||
|
await csvReader.ReadAsync();
|
||||||
|
csvReader.ReadHeader();
|
||||||
|
|
||||||
|
csvWriter.WriteHeader<HsnTsnVehicle>();
|
||||||
|
await csvWriter.NextRecordAsync();
|
||||||
|
|
||||||
|
await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
|
||||||
|
{
|
||||||
|
processed++;
|
||||||
|
var changed = false;
|
||||||
|
|
||||||
|
if ((record.YearFrom is null || record.YearTo is null) && !string.IsNullOrWhiteSpace(record.SourceDetailUrl))
|
||||||
{
|
{
|
||||||
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
|
try
|
||||||
if (detail is not null)
|
|
||||||
{
|
{
|
||||||
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
var detail = await GetVehicleDetailWithRetry(hsnTsnClient, record.SourceDetailUrl, record.HsnTsn);
|
||||||
|
if (detail is not null)
|
||||||
{
|
{
|
||||||
vehicle.Brand = detail.Brand;
|
if (record.YearFrom is null && detail.YearFrom is not null)
|
||||||
}
|
{
|
||||||
|
record.YearFrom = detail.YearFrom;
|
||||||
|
changed = true;
|
||||||
|
}
|
||||||
|
|
||||||
vehicle.Model = detail.Model;
|
if (record.YearTo is null && detail.YearTo is not null)
|
||||||
vehicle.OfficialType = detail.OfficialType;
|
{
|
||||||
vehicle.YearFrom = detail.YearFrom;
|
record.YearTo = detail.YearTo;
|
||||||
vehicle.YearTo = detail.YearTo;
|
changed = true;
|
||||||
|
}
|
||||||
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
|
|
||||||
{
|
|
||||||
vehicle.SourceDetailUrl = detail.CanonicalUrl;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Repair failed: {record.HsnTsn} -> {ex.Message}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (changed)
|
||||||
|
{
|
||||||
|
updated++;
|
||||||
|
}
|
||||||
|
|
||||||
|
record.MatchKey = BuildMatchKey(record);
|
||||||
|
csvWriter.WriteRecord(record);
|
||||||
|
await csvWriter.NextRecordAsync();
|
||||||
|
|
||||||
|
if (processed % 250 == 0)
|
||||||
|
{
|
||||||
|
await csvWriter.FlushAsync();
|
||||||
|
Console.Error.WriteLine($"[info] Repair progress: processed={processed}, updated={updated}, failed={failed}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await csvWriter.FlushAsync();
|
||||||
|
Console.Error.WriteLine($"[info] Repair finished. processed={processed}, updated={updated}, failed={failed}");
|
||||||
|
}
|
||||||
|
|
||||||
|
async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages)
|
||||||
|
{
|
||||||
|
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||||
|
var processed = 0;
|
||||||
|
var failed = 0;
|
||||||
|
|
||||||
|
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetailPages}");
|
||||||
|
|
||||||
|
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
|
||||||
|
{
|
||||||
|
Delimiter = ";"
|
||||||
|
});
|
||||||
|
|
||||||
|
csvWriter.WriteHeader<HsnTsnVehicle>();
|
||||||
|
await csvWriter.NextRecordAsync();
|
||||||
|
await csvWriter.FlushAsync();
|
||||||
|
|
||||||
|
if (Console.IsInputRedirected)
|
||||||
|
{
|
||||||
|
await foreach (var query in ReadInput())
|
||||||
|
{
|
||||||
|
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
vehicles = await hsnTsnClient.GetVehiclesFromSearchAsync(query);
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach (var vehicle in vehicles)
|
||||||
|
{
|
||||||
|
await WriteVehicleIfNew(vehicle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
IReadOnlyList<string> brandUrls;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
brandUrls = await hsnTsnClient.GetBrandPageUrls();
|
||||||
}
|
}
|
||||||
catch (Exception ex)
|
catch (Exception ex)
|
||||||
{
|
{
|
||||||
failed++;
|
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
|
||||||
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach (var url in brandUrls)
|
||||||
|
{
|
||||||
|
Console.Error.WriteLine($"[info] Processing: {url}");
|
||||||
|
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
vehicles = await hsnTsnClient.GetVehiclesFromBrandPageAsync(url);
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach (var vehicle in vehicles)
|
||||||
|
{
|
||||||
|
await WriteVehicleIfNew(vehicle);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
vehicle.MatchKey = BuildMatchKey(vehicle);
|
await csvWriter.FlushAsync();
|
||||||
csvWriter.WriteRecord(vehicle);
|
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
|
||||||
await csvWriter.NextRecordAsync();
|
return;
|
||||||
|
|
||||||
if (written.Count % 250 == 0)
|
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
|
||||||
{
|
{
|
||||||
await csvWriter.FlushAsync();
|
processed++;
|
||||||
|
|
||||||
|
if (!written.Add(vehicle.HsnTsn))
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (includeDetailPages && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
var detail = await GetVehicleDetailWithRetry(hsnTsnClient, vehicle.SourceDetailUrl, vehicle.HsnTsn);
|
||||||
|
if (detail is not null)
|
||||||
|
{
|
||||||
|
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
||||||
|
{
|
||||||
|
vehicle.Brand = detail.Brand;
|
||||||
|
}
|
||||||
|
|
||||||
|
vehicle.Model = detail.Model;
|
||||||
|
vehicle.OfficialType = detail.OfficialType;
|
||||||
|
vehicle.YearFrom = detail.YearFrom;
|
||||||
|
vehicle.YearTo = detail.YearTo;
|
||||||
|
|
||||||
|
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
|
||||||
|
{
|
||||||
|
vehicle.SourceDetailUrl = detail.CanonicalUrl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vehicle.MatchKey = BuildMatchKey(vehicle);
|
||||||
|
csvWriter.WriteRecord(vehicle);
|
||||||
|
await csvWriter.NextRecordAsync();
|
||||||
|
|
||||||
|
if (written.Count % 250 == 0)
|
||||||
|
{
|
||||||
|
await csvWriter.FlushAsync();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Looks up the value of a command-line option.
//
// Two spellings are accepted, both matched case-insensitively on the option name:
//   --name value
//   --name=value
//
// cliArgs:    the raw argument vector to scan, in order.
// optionName: the option to look for, including its leading dashes (e.g. "--input-csv").
//
// Returns the option's value, or null when the option is absent or was given without a
// value. The first occurrence of the option decides the result.
static string? GetOptionValue(string[] cliArgs, string optionName)
{
    // Loop-invariant, so build it once instead of on every iteration.
    var prefix = optionName + "=";

    for (var i = 0; i < cliArgs.Length; i++)
    {
        var arg = cliArgs[i];

        // "--name=value" form: everything after the '=' is the value (possibly empty).
        if (arg.StartsWith(prefix, StringComparison.OrdinalIgnoreCase))
        {
            return arg[prefix.Length..];
        }

        if (!arg.Equals(optionName, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // "--name value" form. A following token that itself starts with "--" is the next
        // option, not a value: "--input-csv --output-csv out.csv" must not yield
        // "--output-csv" as the input path. A value that really begins with "--" can still
        // be passed using the "--name=value" form.
        var hasValue = i + 1 < cliArgs.Length
            && !cliArgs[i + 1].StartsWith("--", StringComparison.Ordinal);

        return hasValue ? cliArgs[i + 1] : null;
    }

    return null;
}
|
||||||
|
|
||||||
|
// Fetches the detail page for one vehicle, retrying when the request is rejected with
// HTTP 429 (as judged by IsTooManyRequests).
//
// hsnTsnClient: client used to perform the detail request.
// detailUrl:    URL handed to GetVehicleDetailAsync.
// hsnTsn:       identifier used only for the progress log line.
//
// Up to 7 attempts are made. Between attempts the wait doubles (2s, 4s, 8s, ...) and is
// capped at 60s. Any exception that is not a retryable 429, and a 429 on the final attempt,
// propagates to the caller.
async Task<VehicleDetail?> GetVehicleDetailWithRetry(HsnTsnClient hsnTsnClient, string detailUrl, string hsnTsn)
{
    const int maxAttempts = 7;
    Console.Error.WriteLine($"[info] Fetching detail for HSN/TSN: {hsnTsn}");

    var attempt = 1;
    while (true)
    {
        try
        {
            return await hsnTsnClient.GetVehicleDetailAsync(detailUrl);
        }
        catch (HttpRequestException ex) when (attempt < maxAttempts && IsTooManyRequests(ex))
        {
            // 2^attempt seconds, never more than a minute.
            var delaySeconds = Math.Min(60, 1 << attempt);
            Console.Error.WriteLine($"[warn] 429 for detail url, retrying in {delaySeconds}s (attempt {attempt}/{maxAttempts})");
            await Task.Delay(TimeSpan.FromSeconds(delaySeconds));
        }

        // Only reached after a handled 429; success returns and other failures throw.
        attempt++;
    }
}
|
||||||
|
|
||||||
|
// Reports whether an HTTP failure should be treated as a 429 (Too Many Requests) response.
// The typed status code is consulted first; otherwise the exception message is searched
// for the text "429".
static bool IsTooManyRequests(HttpRequestException ex) =>
    ex.StatusCode is HttpStatusCode.TooManyRequests
    || ex.Message.Contains("429", StringComparison.OrdinalIgnoreCase);
|
||||||
|
|
||||||
async IAsyncEnumerable<string> ReadInput()
|
async IAsyncEnumerable<string> ReadInput()
|
||||||
{
|
{
|
||||||
var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||||
@@ -160,6 +326,6 @@ string BuildMatchKey(HsnTsnVehicle vehicle)
|
|||||||
.Replace("Ü", "UE")
|
.Replace("Ü", "UE")
|
||||||
.Replace("ß", "SS");
|
.Replace("ß", "SS");
|
||||||
|
|
||||||
normalized = System.Text.RegularExpressions.Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
|
normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
|
||||||
return System.Text.RegularExpressions.Regex.Replace(normalized, @"\s+", " ");
|
return Regex.Replace(normalized, @"\s+", " ");
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user