Add Autoampel scraping functionality and enhance vehicle data processing

This commit is contained in:
2026-03-05 00:44:00 +03:00
parent c7750ac4ca
commit 223da27094
4 changed files with 1086 additions and 7 deletions
+21
View File
@@ -21,6 +21,12 @@ Scrape all brand pages:
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
``` ```
Scrape directly from Autoampel typklassen pages (no hsn-tsn redirect chain):
```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --source autoampel > hsntsn.csv
```
Scrape only specific queries from `stdin`: Scrape only specific queries from `stdin`:
```bash ```bash
@@ -38,3 +44,18 @@ Repair only missing year fields from an existing CSV:
```bash ```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --repair-years --input-csv hsntsn.csv --output-csv hsntsn.repaired.csv
``` ```
Merge core fields by `HsnTsn` and write to PostgreSQL (priority: `hsn-tsn.de` then `autoampel.de`):
```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --merge-core-db --pg-connection "Host=localhost;Port=5432;Database=hsntsn;Username=hsntsn;Password=hsntsn" --pg-table public.hsntsn_vehicle
```
You can also pass the connection via environment variable:
```bash
export HSNTSN_PG="Host=localhost;Port=5432;Database=hsntsn;Username=hsntsn;Password=hsntsn"
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --merge-core-db
```
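Optional: by default the seed data is scraped live from `hsn-tsn.de`; if you already have a CSV, you can seed from it instead with `--input-csv hsntsn.csv` (if that file does not exist, the tool falls back to the live scrape).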
+149 -3
View File
@@ -9,8 +9,10 @@ namespace HsnTsnScraper;
public sealed class HsnTsnClient : IDisposable public sealed class HsnTsnClient : IDisposable
{ {
private const string AutoampelBaseUrl = "https://www.autoampel.de/";
private static readonly Regex HsnTsnRegex = new(@"(?<hsn>\d{4})\s*/\s*(?<tsn>[A-Z0-9]{3})", RegexOptions.Compiled); private static readonly Regex HsnTsnRegex = new(@"(?<hsn>\d{4})\s*/\s*(?<tsn>[A-Z0-9]{3})", RegexOptions.Compiled);
private static readonly Regex PsKwRegex = new(@"(?<ps>\d+)\s*PS\s*\((?<kw>\d+)\s*kW\)", RegexOptions.Compiled | RegexOptions.IgnoreCase); private static readonly Regex PsKwRegex = new(@"(?<ps>\d+)\s*PS\s*\((?<kw>\d+)\s*kW\)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex PsKwShortRegex = new(@"(?<ps>\d+)\s*\((?<kw>\d+)\)", RegexOptions.Compiled);
private static readonly Regex CcmRegex = new(@"(?<ccm>[\d\.\,]+)\s*ccm", RegexOptions.Compiled | RegexOptions.IgnoreCase); private static readonly Regex CcmRegex = new(@"(?<ccm>[\d\.\,]+)\s*ccm", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex YearRangeRegex = new(@"(?<from>\d{4})(?:\D+(?<to>\d{4}))?", RegexOptions.Compiled); private static readonly Regex YearRangeRegex = new(@"(?<from>\d{4})(?:\D+(?<to>\d{4}))?", RegexOptions.Compiled);
private static readonly HashSet<string> ExcludedLinks = new(StringComparer.OrdinalIgnoreCase) private static readonly HashSet<string> ExcludedLinks = new(StringComparer.OrdinalIgnoreCase)
@@ -101,6 +103,116 @@ public sealed class HsnTsnClient : IDisposable
return ParseVehiclesFromListPage(html, absoluteUrl, null); return ParseVehiclesFromListPage(html, absoluteUrl, null);
} }
/// <summary>
/// Queries the list endpoint (<c>/liste.php?string=…</c>, relative to the client's base address)
/// for a single HSN and returns the parsed vehicles whose HSN equals the requested one.
/// </summary>
/// <param name="hsn">HSN to look up; surrounding whitespace is ignored.</param>
/// <param name="cancellationToken">Token forwarded to the HTTP request.</param>
/// <returns>
/// The matching vehicles, or an empty list when <paramref name="hsn"/> is null or blank.
/// </returns>
public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesByHsnFromHsnTsnAsync(string hsn, CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(hsn))
    {
        return Array.Empty<HsnTsnVehicle>();
    }

    // Normalize once so the query, the parser argument and the result filter all use the same
    // value. Previously only the query was trimmed, so padded input such as " 0603 " was
    // requested correctly but then filtered down to nothing.
    var normalizedHsn = hsn.Trim();
    var url = $"/liste.php?string={Uri.EscapeDataString(normalizedHsn)}";
    var html = await GetStringAsync(url, cancellationToken);

    return ParseVehiclesFromListPage(html, new Uri(_client.BaseAddress!, url).ToString(), normalizedHsn)
        .Where(v => v.Hsn.Equals(normalizedHsn, StringComparison.OrdinalIgnoreCase))
        .ToArray();
}
/// <summary>Builds the absolute URL of the Autoampel full "typklassen" list page.</summary>
/// <returns><c>AutoampelBaseUrl</c> followed by <c>typklassen/liste</c>.</returns>
public string GetAutoampelFullListUrl() => AutoampelBaseUrl + "typklassen/liste";
/// <summary>
/// Downloads one Autoampel list page and converts each row of the <c>autolist</c> table into
/// an <see cref="HsnTsnVehicle"/>.
/// </summary>
/// <param name="absoluteUrl">URL of the list page; also recorded as each vehicle's source list URL.</param>
/// <param name="cancellationToken">Token forwarded to the HTTP request.</param>
/// <returns>
/// The parsed vehicles together with the absolute URL of the pagination "next" link, or
/// <c>null</c> for that URL when the page has no such link.
/// </returns>
public async Task<AutoampelPageResult> GetVehiclesFromAutoampelListPageAsync(string absoluteUrl, CancellationToken cancellationToken = default)
{
    var document = LoadDocument(await GetStringAsync(absoluteUrl, cancellationToken));
    var vehicles = new List<HsnTsnVehicle>();

    var tableRows = document.DocumentNode.SelectNodes("//table[contains(@class,'autolist')]//tbody/tr");
    if (tableRows is not null)
    {
        foreach (var tableRow in tableRows)
        {
            // Column layout used below: 3 = type + detail link, 4 = years, 5 = power,
            // 6 = displacement, 7 = fuel, 8 = HSN/TSN. Shorter rows are skipped.
            var columns = tableRow.SelectNodes("./td");
            if (columns is null || columns.Count < 9)
            {
                continue;
            }

            var keyMatch = HsnTsnRegex.Match(Clean(columns[8].InnerText));
            if (!keyMatch.Success)
            {
                continue;
            }

            var typeCell = columns[3];
            var typeText = Clean(typeCell.InnerText);
            var href = typeCell.SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty;

            ParseYearRange(Clean(columns[4].InnerText), out var firstYear, out var lastYear);
            ParsePowerAutoampel(Clean(columns[5].InnerText), out var horsepower, out var kilowatts);

            var hsnValue = keyMatch.Groups["hsn"].Value;
            var tsnValue = keyMatch.Groups["tsn"].Value;

            var parsed = new HsnTsnVehicle
            {
                Hsn = hsnValue,
                Tsn = tsnValue,
                HsnTsn = $"{hsnValue}/{tsnValue}",
                Brand = ExtractBrand(typeText),
                VehicleType = typeText,
                YearFrom = firstYear,
                YearTo = lastYear,
                PowerPs = horsepower,
                PowerKw = kilowatts,
                DisplacementCcm = ParseDisplacementAutoampel(Clean(columns[6].InnerText)),
                FuelType = Clean(columns[7].InnerText),
                SourceListUrl = absoluteUrl,
                SourceDetailUrl = ToAbsoluteUrl(href, AutoampelBaseUrl)
            };

            // The match key is derived from the fully populated vehicle.
            parsed.MatchKey = BuildMatchKey(parsed);
            vehicles.Add(parsed);
        }
    }

    var nextLink = document.DocumentNode.SelectSingleNode("//ul[@id='pagination']//a[@rel='next']");
    var nextHref = nextLink?.GetAttributeValue("href", string.Empty) ?? string.Empty;

    return new AutoampelPageResult
    {
        Vehicles = vehicles,
        NextPageUrl = string.IsNullOrWhiteSpace(nextHref) ? null : ToAbsoluteUrl(nextHref, AutoampelBaseUrl)
    };
}
/// <summary>
/// Walks the Autoampel per-HSN list (<c>typklassen/hsn-{hsn}</c>) page by page and collects
/// the vehicles whose HSN equals the requested one.
/// </summary>
/// <param name="hsn">HSN to look up; surrounding whitespace is ignored.</param>
/// <param name="cancellationToken">Token forwarded to every page request.</param>
/// <returns>
/// All matching vehicles across the visited pages, or an empty list when
/// <paramref name="hsn"/> is null or blank.
/// </returns>
public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesByHsnFromAutoampelAsync(string hsn, CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(hsn))
    {
        return Array.Empty<HsnTsnVehicle>();
    }

    // Trim and escape so padded or unusual input cannot produce a malformed URL, and use the
    // same normalized value for the result filter.
    var normalizedHsn = hsn.Trim();
    string? currentUrl = $"{AutoampelBaseUrl}typklassen/hsn-{Uri.EscapeDataString(normalizedHsn)}";

    // Remember visited pages: a "next" link that points back to an earlier page would
    // otherwise keep this loop running forever.
    var visitedUrls = new HashSet<string>(StringComparer.Ordinal);
    var all = new List<HsnTsnVehicle>();

    while (!string.IsNullOrWhiteSpace(currentUrl) && visitedUrls.Add(currentUrl))
    {
        var page = await GetVehiclesFromAutoampelListPageAsync(currentUrl, cancellationToken);
        all.AddRange(page.Vehicles.Where(v => v.Hsn.Equals(normalizedHsn, StringComparison.OrdinalIgnoreCase)));
        currentUrl = page.NextPageUrl;
    }

    return all;
}
public async Task<VehicleDetail?> GetVehicleDetailAsync(string detailUrl, CancellationToken cancellationToken = default) public async Task<VehicleDetail?> GetVehicleDetailAsync(string detailUrl, CancellationToken cancellationToken = default)
{ {
if (string.IsNullOrWhiteSpace(detailUrl)) if (string.IsNullOrWhiteSpace(detailUrl))
@@ -178,7 +290,7 @@ public sealed class HsnTsnClient : IDisposable
var vehicleTypeRaw = Clean(cells[1].InnerText); var vehicleTypeRaw = Clean(cells[1].InnerText);
var detailHref = cells[1].SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty; var detailHref = cells[1].SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty;
var detailUrl = ToAbsoluteUrl(detailHref); var detailUrl = ToAbsoluteUrl(detailHref, _client.BaseAddress!.ToString());
var powerText = Clean(cells[2].InnerText); var powerText = Clean(cells[2].InnerText);
ParsePower(powerText, out var ps, out var kw); ParsePower(powerText, out var ps, out var kw);
@@ -212,7 +324,7 @@ public sealed class HsnTsnClient : IDisposable
return result; return result;
} }
private string ToAbsoluteUrl(string href) private static string ToAbsoluteUrl(string href, string baseUrl)
{ {
if (string.IsNullOrWhiteSpace(href)) if (string.IsNullOrWhiteSpace(href))
{ {
@@ -227,7 +339,7 @@ public sealed class HsnTsnClient : IDisposable
} }
var normalized = href.StartsWith("/", StringComparison.Ordinal) ? href : $"/{href.TrimStart('/')}"; var normalized = href.StartsWith("/", StringComparison.Ordinal) ? href : $"/{href.TrimStart('/')}";
return new Uri(_client.BaseAddress!, normalized).ToString(); return new Uri(new Uri(baseUrl), normalized).ToString();
} }
private static HtmlDocument LoadDocument(string html) private static HtmlDocument LoadDocument(string html)
@@ -264,6 +376,28 @@ public sealed class HsnTsnClient : IDisposable
} }
} }
/// <summary>
/// Extracts the two power figures from text matched by <c>PsKwShortRegex</c>
/// (a number followed by a second number in parentheses).
/// </summary>
/// <param name="powerText">Cell text to inspect.</param>
/// <param name="ps">The first number (regex group <c>ps</c>), or <c>null</c> when absent or unparsable.</param>
/// <param name="kw">The parenthesized number (regex group <c>kw</c>), or <c>null</c> when absent or unparsable.</param>
private static void ParsePowerAutoampel(string powerText, out int? ps, out int? kw)
{
    var parsed = PsKwShortRegex.Match(powerText);

    // Each value is resolved independently; a failed match leaves both as null.
    ps = parsed.Success && int.TryParse(parsed.Groups["ps"].Value, out var horsepower) ? horsepower : null;
    kw = parsed.Success && int.TryParse(parsed.Groups["kw"].Value, out var kilowatts) ? kilowatts : null;
}
private static int? ParseDisplacement(string text) private static int? ParseDisplacement(string text)
{ {
var match = CcmRegex.Match(text); var match = CcmRegex.Match(text);
@@ -276,6 +410,12 @@ public sealed class HsnTsnClient : IDisposable
return int.TryParse(numeric, out var ccm) ? ccm : null; return int.TryParse(numeric, out var ccm) ? ccm : null;
} }
/// <summary>
/// Reads a displacement value by discarding every non-digit character of
/// <paramref name="text"/> and parsing what remains.
/// </summary>
/// <param name="text">Cell text to inspect.</param>
/// <returns>The parsed integer, or <c>null</c> when no parsable digits remain.</returns>
private static int? ParseDisplacementAutoampel(string text)
{
    var digitsOnly = Regex.Replace(text, @"[^\d]", string.Empty);
    if (!int.TryParse(digitsOnly, out var ccm))
    {
        return null;
    }

    return ccm;
}
private static void ParseYearRange(string yearText, out int? fromYear, out int? toYear) private static void ParseYearRange(string yearText, out int? fromYear, out int? toYear)
{ {
fromYear = null; fromYear = null;
@@ -367,6 +507,12 @@ public sealed class HsnTsnClient : IDisposable
} }
} }
/// <summary>Result of parsing a single Autoampel list page.</summary>
public sealed class AutoampelPageResult
{
/// <summary>Vehicles parsed from the page; defaults to an empty list.</summary>
public IReadOnlyList<HsnTsnVehicle> Vehicles { get; init; } = Array.Empty<HsnTsnVehicle>();
/// <summary>URL of the following page, or <c>null</c> when there is none.</summary>
public string? NextPageUrl { get; init; }
}
public sealed class VehicleDetail public sealed class VehicleDetail
{ {
public string Hsn { get; set; } = string.Empty; public string Hsn { get; set; } = string.Empty;
+1
View File
@@ -12,6 +12,7 @@
<PackageReference Include="HtmlAgilityPack" Version="1.11.71" /> <PackageReference Include="HtmlAgilityPack" Version="1.11.71" />
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.1" /> <PackageReference Include="Microsoft.Extensions.Http" Version="8.0.1" />
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.10" /> <PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.10" />
<PackageReference Include="Npgsql" Version="8.0.5" />
</ItemGroup> </ItemGroup>
</Project> </Project>
+915 -4
View File
@@ -4,23 +4,867 @@ using System.Text.RegularExpressions;
using CsvHelper; using CsvHelper;
using CsvHelper.Configuration; using CsvHelper.Configuration;
using HsnTsnScraper; using HsnTsnScraper;
using Npgsql;
var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase); var repairYears = args.Contains("--repair-years", StringComparer.OrdinalIgnoreCase);
var mergeCore = args.Contains("--merge-core", StringComparer.OrdinalIgnoreCase);
var mergeCoreDb = args.Contains("--merge-core-db", StringComparer.OrdinalIgnoreCase);
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase); var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
var inputCsv = GetOptionValue(args, "--input-csv"); var inputCsv = GetOptionValue(args, "--input-csv");
var outputCsv = GetOptionValue(args, "--output-csv"); var outputCsv = GetOptionValue(args, "--output-csv");
var pgConnection = GetOptionValue(args, "--pg-connection");
var pgTable = GetOptionValue(args, "--pg-table") ?? "public.hsntsn_vehicle";
var source = (GetOptionValue(args, "--source") ?? "hsntsn").Trim().ToLowerInvariant();
using var client = new HsnTsnClient(); using var client = new HsnTsnClient();
// Mode dispatch: --merge-core (CSV in, CSV out) is checked first and wins over the other flags.
if (mergeCore)
{
await RunMergeCoreMode(client, inputCsv, outputCsv);
return;
}
// --merge-core-db (write to PostgreSQL) is checked next, before the remaining modes below.
if (mergeCoreDb)
{
await RunMergeCoreDbMode(client, inputCsv, pgConnection, pgTable);
return;
}
if (repairYears) if (repairYears)
{ {
await RunRepairYearsMode(client, inputCsv, outputCsv); await RunRepairYearsMode(client, inputCsv, outputCsv);
return; return;
} }
await RunScrapeMode(client, includeDetails); await RunScrapeMode(client, includeDetails, source);
return; return;
// Merge-core (CSV) mode: reads the input CSV, collapses it to one row per HsnTsn, fills blank
// core fields first from hsn-tsn.de (per HSN) and then from an autoampel.de index, and writes
// the result to the output CSV.
//   inputPath  - source CSV; defaults to "hsntsn.csv" when blank.
//   outputPath - destination CSV; defaults to "hsntsn.core.csv" when blank.
// Progress, warnings and errors are written to stderr; nothing is returned.
async Task RunMergeCoreMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
{
    var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
    var outputCsvPath = string.IsNullOrWhiteSpace(outputPath) ? "hsntsn.core.csv" : outputPath;

    if (Path.GetFullPath(inputCsvPath).Equals(Path.GetFullPath(outputCsvPath), StringComparison.OrdinalIgnoreCase))
    {
        Console.Error.WriteLine("[error] --input-csv and --output-csv cannot point to the same file.");
        return;
    }

    // Report a missing input file in the same style as the other argument errors instead of
    // letting File.OpenRead fail with an unhandled exception.
    if (!File.Exists(inputCsvPath))
    {
        Console.Error.WriteLine($"[error] Input CSV not found: {inputCsvPath}");
        return;
    }

    Console.Error.WriteLine($"[info] Merge-core mode started. input={inputCsvPath}, output={outputCsvPath}");

    var map = new Dictionary<string, CoreOutputRow>(StringComparer.OrdinalIgnoreCase);
    await using (var inputStream = File.OpenRead(inputCsvPath))
    using (var inputReader = new StreamReader(inputStream))
    using (var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";",
        MissingFieldFound = null,
        HeaderValidated = null
    }))
    {
        // ReadAsync returns false when the file has no rows at all; skip header parsing in
        // that case rather than relying on how ReadHeader reacts to a missing header.
        if (await csvReader.ReadAsync())
        {
            csvReader.ReadHeader();
            await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
            {
                var key = record.HsnTsn?.Trim();
                if (string.IsNullOrWhiteSpace(key))
                {
                    continue;
                }

                // Model falls back to the raw vehicle type. The same fallback is used for the
                // first and for later records of a key (it used to apply to the first only).
                var candidateModel = !string.IsNullOrWhiteSpace(record.Model) ? record.Model : record.VehicleType;

                if (!map.TryGetValue(key, out var row))
                {
                    map[key] = new CoreOutputRow
                    {
                        HsnTsn = key,
                        Hsn = record.Hsn,
                        Tsn = record.Tsn,
                        Brand = record.Brand,
                        Model = candidateModel,
                        YearFrom = record.YearFrom,
                        YearTo = record.YearTo
                    };
                    continue;
                }

                // Later records for a known key only fill fields that are still blank.
                if (string.IsNullOrWhiteSpace(row.Hsn) && !string.IsNullOrWhiteSpace(record.Hsn)) row.Hsn = record.Hsn;
                if (string.IsNullOrWhiteSpace(row.Tsn) && !string.IsNullOrWhiteSpace(record.Tsn)) row.Tsn = record.Tsn;
                if (string.IsNullOrWhiteSpace(row.Brand) && !string.IsNullOrWhiteSpace(record.Brand)) row.Brand = record.Brand;
                if (string.IsNullOrWhiteSpace(row.Model) && !string.IsNullOrWhiteSpace(candidateModel)) row.Model = candidateModel;
                if (row.YearFrom is null && record.YearFrom is not null) row.YearFrom = record.YearFrom;
                if (row.YearTo is null && record.YearTo is not null) row.YearTo = record.YearTo;
            }
        }
    }

    var byHsn = map.Values
        .Where(x => !string.IsNullOrWhiteSpace(x.Hsn))
        .GroupBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase)
        .OrderBy(x => x.Key, StringComparer.OrdinalIgnoreCase)
        .ToList();

    var hsnIndex = 0;
    var filledFromHsnTsn = 0;
    var filledFromAutoampel = 0;
    var failed = 0;

    // Built lazily on the first HSN group that still has gaps, then reused for all others.
    Dictionary<string, HsnTsnVehicle>? autoampelIndex = null;

    foreach (var hsnGroup in byHsn)
    {
        hsnIndex++;
        var hsn = hsnGroup.Key;
        Console.Error.WriteLine($"[info] [{hsnIndex}/{byHsn.Count}] HSN {hsn}: checking hsn-tsn.de");
        var yearMissingBeforeHsnTsn = hsnGroup.Count(r => r.YearFrom is null || r.YearTo is null);

        Dictionary<string, HsnTsnVehicle> fromHsnTsn;
        try
        {
            fromHsnTsn = (await hsnTsnClient.GetVehiclesByHsnFromHsnTsnAsync(hsn))
                .GroupBy(v => v.HsnTsn, StringComparer.OrdinalIgnoreCase)
                .Select(g => g.First())
                .ToDictionary(v => v.HsnTsn, v => v, StringComparer.OrdinalIgnoreCase);
        }
        catch (Exception ex)
        {
            // Best effort: a failed lookup is counted and the group continues with no matches.
            failed++;
            Console.Error.WriteLine($"[warn] HSN {hsn} failed on hsn-tsn.de -> {ex.Message}");
            fromHsnTsn = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);
        }

        foreach (var row in hsnGroup)
        {
            if (fromHsnTsn.TryGetValue(row.HsnTsn, out var sourceRow))
            {
                // Years are not taken from this source (fillYears: false).
                filledFromHsnTsn += FillCoreFields(row, sourceRow, fillYears: false);
            }
        }

        var matchedHsnTsn = hsnGroup.Count(r => fromHsnTsn.ContainsKey(r.HsnTsn));
        var yearMissingAfterHsnTsn = hsnGroup.Count(r => r.YearFrom is null || r.YearTo is null);
        var yearFilledByHsnTsn = Math.Max(0, yearMissingBeforeHsnTsn - yearMissingAfterHsnTsn);
        Console.Error.WriteLine($"[info] [{hsnIndex}/{byHsn.Count}] HSN {hsn}: hsn-tsn matched={matchedHsnTsn}, year_filled={yearFilledByHsnTsn}, still_missing={yearMissingAfterHsnTsn}");

        var needsAutoampel = hsnGroup.Any(r => r.YearFrom is null || r.YearTo is null || string.IsNullOrWhiteSpace(r.Brand) || string.IsNullOrWhiteSpace(r.Model));
        if (!needsAutoampel)
        {
            continue;
        }

        Console.Error.WriteLine($"[info] [{hsnIndex}/{byHsn.Count}] HSN {hsn}: checking autoampel.de");
        var yearMissingBeforeAutoampel = hsnGroup.Count(r => r.YearFrom is null || r.YearTo is null);

        if (autoampelIndex is null)
        {
            try
            {
                autoampelIndex = await BuildAutoampelIndexByHsnTsnAsync(hsnTsnClient);
            }
            catch (Exception ex)
            {
                // An empty index keeps the run going and prevents a rebuild for every group.
                failed++;
                Console.Error.WriteLine($"[warn] autoampel index build failed -> {ex.Message}");
                autoampelIndex = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);
            }
        }

        foreach (var row in hsnGroup)
        {
            if (autoampelIndex.TryGetValue(row.HsnTsn, out var sourceRow))
            {
                filledFromAutoampel += FillCoreFields(row, sourceRow, fillYears: true);
            }
        }

        var matchedAutoampel = hsnGroup.Count(r => autoampelIndex.ContainsKey(r.HsnTsn));
        var yearMissingAfterAutoampel = hsnGroup.Count(r => r.YearFrom is null || r.YearTo is null);
        var yearFilledByAutoampel = Math.Max(0, yearMissingBeforeAutoampel - yearMissingAfterAutoampel);
        Console.Error.WriteLine($"[info] [{hsnIndex}/{byHsn.Count}] HSN {hsn}: autoampel matched={matchedAutoampel}, year_filled={yearFilledByAutoampel}, still_missing={yearMissingAfterAutoampel}");
    }

    await using var outputStream = File.Create(outputCsvPath);
    await using var outputWriter = new StreamWriter(outputStream);
    await using var csvWriter = new CsvWriter(outputWriter, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";"
    });

    csvWriter.WriteHeader<CoreOutputRow>();
    await csvWriter.NextRecordAsync();
    foreach (var row in map.Values
        .OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase)
        .ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
    {
        csvWriter.WriteRecord(row);
        await csvWriter.NextRecordAsync();
    }

    await csvWriter.FlushAsync();
    Console.Error.WriteLine($"[info] Merge-core finished. rows={map.Count}, filled_hsntsn={filledFromHsnTsn}, filled_autoampel={filledFromAutoampel}, failed={failed}");
}
// Merge-core-db mode: seeds the PostgreSQL table (from a CSV when given and present, otherwise
// from a live hsn-tsn.de scrape), fills gaps per HSN from hsn-tsn.de, and finally upserts the
// autoampel.de list pages into the same table.
//   inputPath           - optional seed CSV.
//   connectionStringArg - PostgreSQL connection string; the HSNTSN_PG environment variable is
//                         used when this is blank.
//   tableName           - optionally schema-qualified target table; validated and quoted
//                         before it is placed into SQL.
// Progress, warnings and errors are written to stderr; nothing is returned.
async Task RunMergeCoreDbMode(HsnTsnClient hsnTsnClient, string? inputPath, string? connectionStringArg, string tableName)
{
    var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? string.Empty : inputPath;
    var connectionString = string.IsNullOrWhiteSpace(connectionStringArg)
        ? Environment.GetEnvironmentVariable("HSNTSN_PG")
        : connectionStringArg;

    if (string.IsNullOrWhiteSpace(connectionString))
    {
        Console.Error.WriteLine("[error] Missing PostgreSQL connection. Use --pg-connection or HSNTSN_PG env var.");
        return;
    }

    var quotedTable = QuoteQualifiedTableName(tableName);
    if (string.IsNullOrWhiteSpace(quotedTable))
    {
        Console.Error.WriteLine($"[error] Invalid --pg-table value: {tableName}");
        return;
    }

    var hasInputCsv = !string.IsNullOrWhiteSpace(inputCsvPath);
    var inputLabel = hasInputCsv ? inputCsvPath : "<none>";
    Console.Error.WriteLine($"[info] Merge-core-db started. input={inputLabel}, table={tableName}");

    Dictionary<string, DbVehicleRow> map;
    if (hasInputCsv && File.Exists(inputCsvPath))
    {
        map = await ReadDbRowsFromCsvAsync(inputCsvPath);
        Console.Error.WriteLine($"[info] Seed source: CSV ({map.Count} unique HsnTsn)");
    }
    else
    {
        if (hasInputCsv)
        {
            Console.Error.WriteLine($"[warn] Input CSV not found: {inputCsvPath}. Falling back to live scrape.");
        }

        map = await BuildSeedRowsFromHsnTsnAsync(hsnTsnClient);
        Console.Error.WriteLine($"[info] Seed source: hsn-tsn.de live scrape ({map.Count} unique HsnTsn)");
    }

    if (map.Count == 0)
    {
        Console.Error.WriteLine("[warn] No seed data collected.");
        return;
    }

    await using var conn = new NpgsqlConnection(connectionString);
    await conn.OpenAsync();
    await EnsureVehicleTableAsync(conn, quotedTable);
    await BulkUpsertVehicleRowsAsync(conn, quotedTable, map.Values);
    Console.Error.WriteLine($"[info] Seed upsert finished. rows={map.Count}");

    var rowsToEnrich = await LoadVehicleRowsNeedingEnrichmentAsync(conn, quotedTable);
    var byHsn = rowsToEnrich
        .Where(x => !string.IsNullOrWhiteSpace(x.Hsn))
        .GroupBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase)
        .OrderBy(x => x.Key, StringComparer.OrdinalIgnoreCase)
        .ToList();

    var hsnIndex = 0;
    var filledFromHsnTsn = 0;
    var failed = 0;

    foreach (var hsnGroup in byHsn)
    {
        hsnIndex++;
        var hsn = hsnGroup.Key;
        Console.Error.WriteLine($"[info] [{hsnIndex}/{byHsn.Count}] HSN {hsn}: checking hsn-tsn.de");
        var yearMissingBeforeHsnTsn = hsnGroup.Count(r => r.YearFrom is null || r.YearTo is null);

        Dictionary<string, HsnTsnVehicle> fromHsnTsn;
        try
        {
            fromHsnTsn = (await hsnTsnClient.GetVehiclesByHsnFromHsnTsnAsync(hsn))
                .GroupBy(v => v.HsnTsn, StringComparer.OrdinalIgnoreCase)
                .Select(g => g.First())
                .ToDictionary(v => v.HsnTsn, v => v, StringComparer.OrdinalIgnoreCase);
        }
        catch (Exception ex)
        {
            // Best effort: a failed lookup is counted and the group continues with no matches.
            failed++;
            Console.Error.WriteLine($"[warn] HSN {hsn} failed on hsn-tsn.de -> {ex.Message}");
            fromHsnTsn = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);
        }

        foreach (var row in hsnGroup)
        {
            if (fromHsnTsn.TryGetValue(row.HsnTsn, out var sourceRow))
            {
                filledFromHsnTsn += FillDbFields(row, sourceRow);
            }
        }

        var matchedHsnTsn = hsnGroup.Count(r => fromHsnTsn.ContainsKey(r.HsnTsn));
        var yearMissingAfterHsnTsn = hsnGroup.Count(r => r.YearFrom is null || r.YearTo is null);
        var yearFilledByHsnTsn = Math.Max(0, yearMissingBeforeHsnTsn - yearMissingAfterHsnTsn);
        Console.Error.WriteLine($"[info] [{hsnIndex}/{byHsn.Count}] HSN {hsn}: hsn-tsn matched={matchedHsnTsn}, year_filled={yearFilledByHsnTsn}, still_missing={yearMissingAfterHsnTsn}");

        // The former "needsAutoampel ... continue" check sat here as the last statement of the
        // loop body and therefore had no effect; it has been removed.
    }

    // Rows that still have no match key get one computed from their own fields.
    foreach (var row in rowsToEnrich)
    {
        if (string.IsNullOrWhiteSpace(row.MatchKey))
        {
            row.MatchKey = BuildMatchKey(new HsnTsnVehicle
            {
                Brand = row.Brand,
                VehicleType = row.VehicleType,
                Model = row.Model,
                OfficialType = row.OfficialType,
                PowerKw = row.PowerKw,
                DisplacementCcm = row.DisplacementCcm,
                FuelType = row.FuelType
            });
        }
    }

    // Report how many rows still have gaps before the autoampel.de pass runs.
    var incompleteAfterHsnTsn = rowsToEnrich.Count(NeedsAutoampelLookup);
    Console.Error.WriteLine($"[info] Rows still incomplete after hsn-tsn.de pass: {incompleteAfterHsnTsn}");

    await BulkUpsertVehicleRowsAsync(conn, quotedTable, rowsToEnrich);

    var autoampelUpserted = 0;
    try
    {
        autoampelUpserted = await UpsertAutoampelPagesIntoDbAsync(hsnTsnClient, conn, quotedTable);
    }
    catch (Exception ex)
    {
        // Pages upserted before the failure are not reflected in autoampelUpserted.
        failed++;
        Console.Error.WriteLine($"[warn] autoampel page-upsert failed -> {ex.Message}");
    }

    Console.Error.WriteLine($"[info] Merge-core-db finished. checked={rowsToEnrich.Count}, filled_hsntsn={filledFromHsnTsn}, autoampel_upserted={autoampelUpserted}, failed={failed}");
}
// Builds the DB seed by scraping every brand page returned by the client and collapsing the
// vehicles to one row per HsnTsn (the first occurrence wins; later ones only fill blanks).
// Returns an empty map when the brand URL list cannot be fetched; a brand page that fails is
// logged and skipped.
async Task<Dictionary<string, DbVehicleRow>> BuildSeedRowsFromHsnTsnAsync(HsnTsnClient hsnTsnClient)
{
    var map = new Dictionary<string, DbVehicleRow>(StringComparer.OrdinalIgnoreCase);

    IReadOnlyList<string> brandUrls;
    try
    {
        brandUrls = await hsnTsnClient.GetBrandPageUrls();
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine($"[error] Could not fetch brand urls for DB seed: {ex.Message}");
        return map;
    }

    var pageIndex = 0;
    foreach (var url in brandUrls)
    {
        pageIndex++;
        Console.Error.WriteLine($"[info] [seed {pageIndex}/{brandUrls.Count}] {url}");

        IReadOnlyList<HsnTsnVehicle> vehicles;
        try
        {
            vehicles = await hsnTsnClient.GetVehiclesFromBrandPageAsync(url);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"[warn] Seed page failed: {url} -> {ex.Message}");
            continue;
        }

        foreach (var v in vehicles)
        {
            // Same key handling as the CSV seed reader: trim, and skip vehicles without a key
            // so a blank HsnTsn never ends up as the table's primary key.
            var key = v.HsnTsn?.Trim();
            if (string.IsNullOrWhiteSpace(key))
            {
                continue;
            }

            if (!map.TryGetValue(key, out var row))
            {
                map[key] = new DbVehicleRow
                {
                    HsnTsn = key,
                    Hsn = v.Hsn,
                    Tsn = v.Tsn,
                    Brand = v.Brand,
                    VehicleType = v.VehicleType,
                    Model = v.Model,
                    OfficialType = v.OfficialType,
                    YearFrom = v.YearFrom,
                    YearTo = v.YearTo,
                    PowerPs = v.PowerPs,
                    PowerKw = v.PowerKw,
                    DisplacementCcm = v.DisplacementCcm,
                    FuelType = v.FuelType,
                    MatchKey = v.MatchKey
                };
                continue;
            }

            // Known key: only fill fields that are still blank.
            if (string.IsNullOrWhiteSpace(row.Hsn) && !string.IsNullOrWhiteSpace(v.Hsn)) row.Hsn = v.Hsn;
            if (string.IsNullOrWhiteSpace(row.Tsn) && !string.IsNullOrWhiteSpace(v.Tsn)) row.Tsn = v.Tsn;
            if (string.IsNullOrWhiteSpace(row.Brand) && !string.IsNullOrWhiteSpace(v.Brand)) row.Brand = v.Brand;
            if (string.IsNullOrWhiteSpace(row.VehicleType) && !string.IsNullOrWhiteSpace(v.VehicleType)) row.VehicleType = v.VehicleType;
            if (string.IsNullOrWhiteSpace(row.Model) && !string.IsNullOrWhiteSpace(v.Model)) row.Model = v.Model;
            if (string.IsNullOrWhiteSpace(row.OfficialType) && !string.IsNullOrWhiteSpace(v.OfficialType)) row.OfficialType = v.OfficialType;
            if (row.YearFrom is null && v.YearFrom is not null) row.YearFrom = v.YearFrom;
            if (row.YearTo is null && v.YearTo is not null) row.YearTo = v.YearTo;
            if (row.PowerPs is null && v.PowerPs is not null) row.PowerPs = v.PowerPs;
            if (row.PowerKw is null && v.PowerKw is not null) row.PowerKw = v.PowerKw;
            if (row.DisplacementCcm is null && v.DisplacementCcm is not null) row.DisplacementCcm = v.DisplacementCcm;
            if (string.IsNullOrWhiteSpace(row.FuelType) && !string.IsNullOrWhiteSpace(v.FuelType)) row.FuelType = v.FuelType;
            if (string.IsNullOrWhiteSpace(row.MatchKey) && !string.IsNullOrWhiteSpace(v.MatchKey)) row.MatchKey = v.MatchKey;
        }
    }

    return map;
}
// Walks the autoampel.de full list page by page and indexes the vehicles by HsnTsn
// (case-insensitive). The first vehicle seen for a key is kept. Exceptions from page requests
// propagate to the caller.
async Task<Dictionary<string, HsnTsnVehicle>> BuildAutoampelIndexByHsnTsnAsync(HsnTsnClient hsnTsnClient)
{
    var index = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);

    // Remember visited pages: a "next" link that points back to an earlier page would
    // otherwise keep this loop running forever.
    var visitedUrls = new HashSet<string>(StringComparer.Ordinal);
    string? pageUrl = hsnTsnClient.GetAutoampelFullListUrl();

    while (!string.IsNullOrWhiteSpace(pageUrl) && visitedUrls.Add(pageUrl))
    {
        Console.Error.WriteLine($"[info] Processing: {pageUrl}");
        var page = await hsnTsnClient.GetVehiclesFromAutoampelListPageAsync(pageUrl);

        foreach (var vehicle in page.Vehicles)
        {
            // TryAdd keeps the first entry and avoids the ContainsKey + indexer double lookup.
            index.TryAdd(vehicle.HsnTsn, vehicle);
        }

        pageUrl = page.NextPageUrl;
    }

    Console.Error.WriteLine($"[info] Autoampel index ready. unique={index.Count}");
    return index;
}
// Walks the autoampel.de full list page by page and upserts each page's vehicles (one row per
// HsnTsn within the page) into the given table. Returns the total number of rows passed to the
// upsert. Exceptions from page requests or the database propagate to the caller.
async Task<int> UpsertAutoampelPagesIntoDbAsync(HsnTsnClient hsnTsnClient, NpgsqlConnection conn, string quotedTable)
{
    // Remember visited pages: a "next" link that points back to an earlier page would
    // otherwise keep this loop running forever.
    var visitedUrls = new HashSet<string>(StringComparer.Ordinal);
    string? pageUrl = hsnTsnClient.GetAutoampelFullListUrl();
    var pageIndex = 0;
    var totalUpserted = 0;

    while (!string.IsNullOrWhiteSpace(pageUrl) && visitedUrls.Add(pageUrl))
    {
        pageIndex++;
        Console.Error.WriteLine($"[info] Processing: {pageUrl}");
        var page = await hsnTsnClient.GetVehiclesFromAutoampelListPageAsync(pageUrl);

        var rows = page.Vehicles
            .GroupBy(v => v.HsnTsn, StringComparer.OrdinalIgnoreCase)
            .Select(g => g.First())
            .Select(v => new DbVehicleRow
            {
                HsnTsn = v.HsnTsn,
                Hsn = v.Hsn,
                Tsn = v.Tsn,
                Brand = v.Brand,
                VehicleType = v.VehicleType,
                // DeriveModel already returns v.Model when it is set, so no extra check is needed.
                Model = DeriveModel(v),
                OfficialType = v.OfficialType,
                YearFrom = v.YearFrom,
                YearTo = v.YearTo,
                PowerPs = v.PowerPs,
                PowerKw = v.PowerKw,
                DisplacementCcm = v.DisplacementCcm,
                FuelType = v.FuelType,
                MatchKey = !string.IsNullOrWhiteSpace(v.MatchKey) ? v.MatchKey : BuildMatchKey(v)
            })
            .ToList();

        if (rows.Count > 0)
        {
            await BulkUpsertVehicleRowsAsync(conn, quotedTable, rows);
            totalUpserted += rows.Count;
        }

        Console.Error.WriteLine($"[info] Autoampel page {pageIndex} upserted={rows.Count}, total={totalUpserted}");
        pageUrl = page.NextPageUrl;
    }

    return totalUpserted;
}
// Copies brand, model and (only when fillYears is true) the year range from source into
// target, touching only fields that are still blank in target.
// Returns the number of fields that were filled.
int FillCoreFields(CoreOutputRow target, HsnTsnVehicle source, bool fillYears)
{
    var filled = 0;

    var brandIsBlank = string.IsNullOrWhiteSpace(target.Brand);
    if (brandIsBlank && !string.IsNullOrWhiteSpace(source.Brand))
    {
        target.Brand = source.Brand;
        filled += 1;
    }

    if (string.IsNullOrWhiteSpace(target.Model))
    {
        var derivedModel = DeriveModel(source);
        if (!string.IsNullOrWhiteSpace(derivedModel))
        {
            target.Model = derivedModel;
            filled += 1;
        }
    }

    if (!fillYears)
    {
        return filled;
    }

    if (target.YearFrom is null && source.YearFrom is not null)
    {
        target.YearFrom = source.YearFrom;
        filled += 1;
    }

    if (target.YearTo is null && source.YearTo is not null)
    {
        target.YearTo = source.YearTo;
        filled += 1;
    }

    return filled;
}
// Returns the vehicle's model: the explicit Model when set; otherwise the trimmed VehicleType
// with a leading "<Brand> " prefix removed (case-insensitive); otherwise an empty string.
string DeriveModel(HsnTsnVehicle vehicle)
{
    if (!string.IsNullOrWhiteSpace(vehicle.Model))
    {
        return vehicle.Model;
    }

    var typeText = (vehicle.VehicleType ?? string.Empty).Trim();
    if (typeText.Length == 0)
    {
        return string.Empty;
    }

    var brandText = (vehicle.Brand ?? string.Empty).Trim();
    var hasBrandPrefix = brandText.Length > 0
        && typeText.StartsWith(brandText + " ", StringComparison.OrdinalIgnoreCase);

    // Skip the brand and the single space that follows it.
    return hasBrandPrefix
        ? typeText.Substring(brandText.Length + 1).Trim()
        : typeText;
}
// True when at least one of the row's descriptive fields (brand, vehicle type, model, years,
// power, displacement, fuel type, match key) is still blank or null.
static bool NeedsAutoampelLookup(DbVehicleRow row)
{
    var textFieldsComplete =
        !string.IsNullOrWhiteSpace(row.Brand)
        && !string.IsNullOrWhiteSpace(row.VehicleType)
        && !string.IsNullOrWhiteSpace(row.Model)
        && !string.IsNullOrWhiteSpace(row.FuelType)
        && !string.IsNullOrWhiteSpace(row.MatchKey);

    var numericFieldsComplete =
        row.YearFrom is not null
        && row.YearTo is not null
        && row.PowerPs is not null
        && row.PowerKw is not null
        && row.DisplacementCcm is not null;

    return !(textFieldsComplete && numericFieldsComplete);
}
// Copies every descriptive field from source into target where target's value is still blank
// or null and source has a value. Model is taken from DeriveModel(source).
// Returns the number of fields that were filled.
int FillDbFields(DbVehicleRow target, HsnTsnVehicle source)
{
    // A text field is fillable when the current value is blank and the candidate is not.
    static bool TextFillable(string? current, string? candidate) =>
        string.IsNullOrWhiteSpace(current) && !string.IsNullOrWhiteSpace(candidate);

    // A numeric field is fillable when the current value is null and the candidate is not.
    static bool NumberFillable(int? current, int? candidate) =>
        current is null && candidate is not null;

    var filled = 0;

    if (TextFillable(target.Brand, source.Brand))
    {
        target.Brand = source.Brand;
        filled += 1;
    }

    if (TextFillable(target.VehicleType, source.VehicleType))
    {
        target.VehicleType = source.VehicleType;
        filled += 1;
    }

    if (string.IsNullOrWhiteSpace(target.Model))
    {
        var derivedModel = DeriveModel(source);
        if (!string.IsNullOrWhiteSpace(derivedModel))
        {
            target.Model = derivedModel;
            filled += 1;
        }
    }

    if (TextFillable(target.OfficialType, source.OfficialType))
    {
        target.OfficialType = source.OfficialType;
        filled += 1;
    }

    if (NumberFillable(target.YearFrom, source.YearFrom))
    {
        target.YearFrom = source.YearFrom;
        filled += 1;
    }

    if (NumberFillable(target.YearTo, source.YearTo))
    {
        target.YearTo = source.YearTo;
        filled += 1;
    }

    if (NumberFillable(target.PowerPs, source.PowerPs))
    {
        target.PowerPs = source.PowerPs;
        filled += 1;
    }

    if (NumberFillable(target.PowerKw, source.PowerKw))
    {
        target.PowerKw = source.PowerKw;
        filled += 1;
    }

    if (NumberFillable(target.DisplacementCcm, source.DisplacementCcm))
    {
        target.DisplacementCcm = source.DisplacementCcm;
        filled += 1;
    }

    if (TextFillable(target.FuelType, source.FuelType))
    {
        target.FuelType = source.FuelType;
        filled += 1;
    }

    if (TextFillable(target.MatchKey, source.MatchKey))
    {
        target.MatchKey = source.MatchKey;
        filled += 1;
    }

    return filled;
}
// Reads a semicolon-delimited vehicle CSV and collapses it to one DbVehicleRow per HsnTsn
// (case-insensitive, trimmed). The first record for a key creates the row; later records only
// fill fields that are still blank. Records without an HsnTsn are skipped.
// Returns an empty map for a file that contains no rows at all.
async Task<Dictionary<string, DbVehicleRow>> ReadDbRowsFromCsvAsync(string inputCsvPath)
{
    var map = new Dictionary<string, DbVehicleRow>(StringComparer.OrdinalIgnoreCase);

    await using var inputStream = File.OpenRead(inputCsvPath);
    using var inputReader = new StreamReader(inputStream);
    using var csvReader = new CsvReader(inputReader, new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = ";",
        MissingFieldFound = null,
        HeaderValidated = null
    });

    // ReadAsync returns false when the file has no rows; return early rather than relying on
    // how ReadHeader reacts to a missing header.
    if (!await csvReader.ReadAsync())
    {
        return map;
    }

    csvReader.ReadHeader();
    await foreach (var record in csvReader.GetRecordsAsync<HsnTsnVehicle>())
    {
        var key = record.HsnTsn?.Trim();
        if (string.IsNullOrWhiteSpace(key))
        {
            continue;
        }

        if (!map.TryGetValue(key, out var row))
        {
            map[key] = new DbVehicleRow
            {
                HsnTsn = key,
                Hsn = record.Hsn,
                Tsn = record.Tsn,
                Brand = record.Brand,
                VehicleType = record.VehicleType,
                Model = record.Model,
                OfficialType = record.OfficialType,
                YearFrom = record.YearFrom,
                YearTo = record.YearTo,
                PowerPs = record.PowerPs,
                PowerKw = record.PowerKw,
                DisplacementCcm = record.DisplacementCcm,
                FuelType = record.FuelType,
                MatchKey = record.MatchKey
            };
            continue;
        }

        // Known key: only fill fields that are still blank.
        if (string.IsNullOrWhiteSpace(row.Hsn) && !string.IsNullOrWhiteSpace(record.Hsn)) row.Hsn = record.Hsn;
        if (string.IsNullOrWhiteSpace(row.Tsn) && !string.IsNullOrWhiteSpace(record.Tsn)) row.Tsn = record.Tsn;
        if (string.IsNullOrWhiteSpace(row.Brand) && !string.IsNullOrWhiteSpace(record.Brand)) row.Brand = record.Brand;
        if (string.IsNullOrWhiteSpace(row.VehicleType) && !string.IsNullOrWhiteSpace(record.VehicleType)) row.VehicleType = record.VehicleType;
        if (string.IsNullOrWhiteSpace(row.Model) && !string.IsNullOrWhiteSpace(record.Model)) row.Model = record.Model;
        if (string.IsNullOrWhiteSpace(row.OfficialType) && !string.IsNullOrWhiteSpace(record.OfficialType)) row.OfficialType = record.OfficialType;
        if (row.YearFrom is null && record.YearFrom is not null) row.YearFrom = record.YearFrom;
        if (row.YearTo is null && record.YearTo is not null) row.YearTo = record.YearTo;
        if (row.PowerPs is null && record.PowerPs is not null) row.PowerPs = record.PowerPs;
        if (row.PowerKw is null && record.PowerKw is not null) row.PowerKw = record.PowerKw;
        if (row.DisplacementCcm is null && record.DisplacementCcm is not null) row.DisplacementCcm = record.DisplacementCcm;
        if (string.IsNullOrWhiteSpace(row.FuelType) && !string.IsNullOrWhiteSpace(record.FuelType)) row.FuelType = record.FuelType;
        if (string.IsNullOrWhiteSpace(row.MatchKey) && !string.IsNullOrWhiteSpace(record.MatchKey)) row.MatchKey = record.MatchKey;
    }

    return map;
}
// Turns a raw "table" or "schema.table" name into a double-quoted SQL identifier.
// Returns an empty string when the input is blank, has more than two dot-separated
// segments, or any segment contains characters other than ASCII letters, digits and
// underscores (a segment must not start with a digit).
// Segments are trimmed and empty segments are dropped before validation.
static string QuoteQualifiedTableName(string raw)
{
    if (string.IsNullOrWhiteSpace(raw))
    {
        return string.Empty;
    }

    var segments = raw.Split('.', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

    // Only "table" or "schema.table" shapes are accepted.
    var hasValidShape = segments.Length is 1 or 2;
    var isSafe = hasValidShape
        && segments.All(segment => Regex.IsMatch(segment, @"^[A-Za-z_][A-Za-z0-9_]*$"));

    if (!isSafe)
    {
        return string.Empty;
    }

    var quotedSegments = segments.Select(segment => $"\"{segment}\"");
    return string.Join(".", quotedSegments);
}
// Issues a CREATE TABLE IF NOT EXISTS statement for the vehicle table on the given connection.
// hsn_tsn is the primary key; hsn and tsn are NOT NULL; every other column is nullable.
// quotedTable is interpolated directly into the DDL text, so it must already be a safely
// quoted identifier (presumably the output of QuoteQualifiedTableName - confirm at the call site).
async Task EnsureVehicleTableAsync(NpgsqlConnection conn, string quotedTable)
{
    var createTableDdl = $@"
CREATE TABLE IF NOT EXISTS {quotedTable} (
hsn_tsn text PRIMARY KEY,
hsn text NOT NULL,
tsn text NOT NULL,
brand text NULL,
vehicle_type text NULL,
model text NULL,
official_type text NULL,
year_to integer NULL,
year_from integer NULL,
power_ps integer NULL,
power_kw integer NULL,
displacement_ccm integer NULL,
fuel_type text NULL,
match_key text NULL
);";

    await using var createCommand = new NpgsqlCommand(createTableDdl, conn);
    await createCommand.ExecuteNonQueryAsync();
}
// Inserts every row into the vehicle table inside a single transaction, one statement per row.
// On a primary-key conflict the existing column value wins unless it is NULL (or, for text
// columns, an empty string); only then is the incoming value taken.
// quotedTable is interpolated directly into the SQL text and must already be a safe identifier.
async Task BulkUpsertVehicleRowsAsync(NpgsqlConnection conn, string quotedTable, IEnumerable<DbVehicleRow> rows)
{
    // Blank text is stored as SQL NULL rather than as an empty string.
    static object TextOrDbNull(string? text) => string.IsNullOrWhiteSpace(text) ? DBNull.Value : text;

    // Missing numbers are stored as SQL NULL.
    static object IntOrDbNull(int? number) => number.HasValue ? number.Value : DBNull.Value;

    var upsertSql = $@"
INSERT INTO {quotedTable} AS t (hsn_tsn, hsn, tsn, brand, vehicle_type, model, official_type, year_to, year_from, power_ps, power_kw, displacement_ccm, fuel_type, match_key)
VALUES (@hsn_tsn, @hsn, @tsn, @brand, @vehicle_type, @model, @official_type, @year_to, @year_from, @power_ps, @power_kw, @displacement_ccm, @fuel_type, @match_key)
ON CONFLICT (hsn_tsn) DO UPDATE
SET hsn = COALESCE(NULLIF(t.hsn, ''), EXCLUDED.hsn),
tsn = COALESCE(NULLIF(t.tsn, ''), EXCLUDED.tsn),
brand = COALESCE(NULLIF(t.brand, ''), EXCLUDED.brand),
vehicle_type = COALESCE(NULLIF(t.vehicle_type, ''), EXCLUDED.vehicle_type),
model = COALESCE(NULLIF(t.model, ''), EXCLUDED.model),
official_type = COALESCE(NULLIF(t.official_type, ''), EXCLUDED.official_type),
year_to = COALESCE(t.year_to, EXCLUDED.year_to),
year_from = COALESCE(t.year_from, EXCLUDED.year_from),
power_ps = COALESCE(t.power_ps, EXCLUDED.power_ps),
power_kw = COALESCE(t.power_kw, EXCLUDED.power_kw),
displacement_ccm = COALESCE(t.displacement_ccm, EXCLUDED.displacement_ccm),
fuel_type = COALESCE(NULLIF(t.fuel_type, ''), EXCLUDED.fuel_type),
match_key = COALESCE(NULLIF(t.match_key, ''), EXCLUDED.match_key);";

    await using var transaction = await conn.BeginTransactionAsync();
    await using var upsertCommand = new NpgsqlCommand(upsertSql, conn, transaction);

    // Parameters are declared once with explicit types and re-bound for every row.
    var parameterSpecs = new (string Name, NpgsqlTypes.NpgsqlDbType Type)[]
    {
        ("hsn_tsn", NpgsqlTypes.NpgsqlDbType.Text),
        ("hsn", NpgsqlTypes.NpgsqlDbType.Text),
        ("tsn", NpgsqlTypes.NpgsqlDbType.Text),
        ("brand", NpgsqlTypes.NpgsqlDbType.Text),
        ("vehicle_type", NpgsqlTypes.NpgsqlDbType.Text),
        ("model", NpgsqlTypes.NpgsqlDbType.Text),
        ("official_type", NpgsqlTypes.NpgsqlDbType.Text),
        ("year_to", NpgsqlTypes.NpgsqlDbType.Integer),
        ("year_from", NpgsqlTypes.NpgsqlDbType.Integer),
        ("power_ps", NpgsqlTypes.NpgsqlDbType.Integer),
        ("power_kw", NpgsqlTypes.NpgsqlDbType.Integer),
        ("displacement_ccm", NpgsqlTypes.NpgsqlDbType.Integer),
        ("fuel_type", NpgsqlTypes.NpgsqlDbType.Text),
        ("match_key", NpgsqlTypes.NpgsqlDbType.Text),
    };

    foreach (var (name, type) in parameterSpecs)
    {
        upsertCommand.Parameters.Add(new NpgsqlParameter(name, type));
    }

    var bound = upsertCommand.Parameters;
    foreach (var vehicle in rows)
    {
        bound["hsn_tsn"].Value = vehicle.HsnTsn;

        // hsn and tsn are NOT NULL columns, so they fall back to an empty string instead of NULL.
        bound["hsn"].Value = vehicle.Hsn ?? string.Empty;
        bound["tsn"].Value = vehicle.Tsn ?? string.Empty;

        bound["brand"].Value = TextOrDbNull(vehicle.Brand);
        bound["vehicle_type"].Value = TextOrDbNull(vehicle.VehicleType);
        bound["model"].Value = TextOrDbNull(vehicle.Model);
        bound["official_type"].Value = TextOrDbNull(vehicle.OfficialType);
        bound["year_to"].Value = IntOrDbNull(vehicle.YearTo);
        bound["year_from"].Value = IntOrDbNull(vehicle.YearFrom);
        bound["power_ps"].Value = IntOrDbNull(vehicle.PowerPs);
        bound["power_kw"].Value = IntOrDbNull(vehicle.PowerKw);
        bound["displacement_ccm"].Value = IntOrDbNull(vehicle.DisplacementCcm);
        bound["fuel_type"].Value = TextOrDbNull(vehicle.FuelType);
        bound["match_key"].Value = TextOrDbNull(vehicle.MatchKey);

        await upsertCommand.ExecuteNonQueryAsync();
    }

    await transaction.CommitAsync();
}
// Reads all rows that have a non-empty hsn and at least one of these fields missing:
// brand, vehicle_type, model, power_ps, power_kw, displacement_ccm, fuel_type, match_key,
// year_to or year_from. Results are ordered by hsn, then tsn.
// NULL text columns are mapped to empty strings and NULL integer columns to null.
// quotedTable is interpolated directly into the SQL text and must already be a safe identifier.
async Task<List<DbVehicleRow>> LoadVehicleRowsNeedingEnrichmentAsync(NpgsqlConnection conn, string quotedTable)
{
    var selectSql = $@"
SELECT hsn_tsn, hsn, tsn, brand, vehicle_type, model, official_type, year_to, year_from, power_ps, power_kw, displacement_ccm, fuel_type, match_key
FROM {quotedTable}
WHERE hsn IS NOT NULL AND hsn <> ''
AND (
brand IS NULL OR brand = '' OR
vehicle_type IS NULL OR vehicle_type = '' OR
model IS NULL OR model = '' OR
power_ps IS NULL OR
power_kw IS NULL OR
displacement_ccm IS NULL OR
fuel_type IS NULL OR fuel_type = '' OR
match_key IS NULL OR match_key = '' OR
year_to IS NULL OR
year_from IS NULL
)
ORDER BY hsn, tsn;";

    var incomplete = new List<DbVehicleRow>();

    await using var selectCommand = new NpgsqlCommand(selectSql, conn);
    await using var cursor = await selectCommand.ExecuteReaderAsync();

    // Column ordinals below follow the SELECT list above.
    string TextAt(int ordinal) => cursor.IsDBNull(ordinal) ? string.Empty : cursor.GetString(ordinal);
    int? IntAt(int ordinal) => cursor.IsDBNull(ordinal) ? null : cursor.GetInt32(ordinal);

    while (await cursor.ReadAsync())
    {
        var vehicle = new DbVehicleRow
        {
            // Read without a NULL check, unlike the other text columns.
            HsnTsn = cursor.GetString(0),
            Hsn = TextAt(1),
            Tsn = TextAt(2),
            Brand = TextAt(3),
            VehicleType = TextAt(4),
            Model = TextAt(5),
            OfficialType = TextAt(6),
            YearTo = IntAt(7),
            YearFrom = IntAt(8),
            PowerPs = IntAt(9),
            PowerKw = IntAt(10),
            DisplacementCcm = IntAt(11),
            FuelType = TextAt(12),
            MatchKey = TextAt(13)
        };
        incomplete.Add(vehicle);
    }

    return incomplete;
}
async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath) async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, string? outputPath)
{ {
var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath; var inputCsvPath = string.IsNullOrWhiteSpace(inputPath) ? "hsntsn.csv" : inputPath;
@@ -113,13 +957,19 @@ async Task RunRepairYearsMode(HsnTsnClient hsnTsnClient, string? inputPath, stri
Console.Error.WriteLine($"[info] Repair finished. {processed}/{totalRecords}, updated={updated}, failed={failed}"); Console.Error.WriteLine($"[info] Repair finished. {processed}/{totalRecords}, updated={updated}, failed={failed}");
} }
async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages) async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages, string scrapeSource)
{ {
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase); var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0; var processed = 0;
var failed = 0; var failed = 0;
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetailPages}"); if (scrapeSource is not ("hsntsn" or "autoampel"))
{
Console.Error.WriteLine($"[error] Unknown --source value: {scrapeSource}. Use 'hsntsn' or 'autoampel'.");
return;
}
Console.Error.WriteLine($"[info] Scrape started. source={scrapeSource}, includeDetails={includeDetailPages}");
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture) await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{ {
@@ -130,7 +980,39 @@ async Task RunScrapeMode(HsnTsnClient hsnTsnClient, bool includeDetailPages)
await csvWriter.NextRecordAsync(); await csvWriter.NextRecordAsync();
await csvWriter.FlushAsync(); await csvWriter.FlushAsync();
if (Console.IsInputRedirected) if (scrapeSource == "autoampel")
{
if (Console.IsInputRedirected)
{
Console.Error.WriteLine("[error] --source autoampel does not support stdin query mode. Run without stdin redirection.");
return;
}
var pageUrl = hsnTsnClient.GetAutoampelFullListUrl();
while (!string.IsNullOrWhiteSpace(pageUrl))
{
Console.Error.WriteLine($"[info] Processing: {pageUrl}");
AutoampelPageResult pageResult;
try
{
pageResult = await hsnTsnClient.GetVehiclesFromAutoampelListPageAsync(pageUrl);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Autoampel page failed: {pageUrl} -> {ex.Message}");
break;
}
foreach (var vehicle in pageResult.Vehicles)
{
await WriteVehicleIfNew(vehicle);
}
pageUrl = pageResult.NextPageUrl;
}
}
else if (Console.IsInputRedirected)
{ {
await foreach (var query in ReadInput()) await foreach (var query in ReadInput())
{ {
@@ -337,3 +1219,32 @@ string BuildMatchKey(HsnTsnVehicle vehicle)
normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim(); normalized = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim();
return Regex.Replace(normalized, @"\s+", " "); return Regex.Replace(normalized, @"\s+", " ");
} }
/// <summary>
/// Reduced vehicle record holding the HSN/TSN key, its two parts, brand, model and year range.
/// Text properties default to an empty string; the year properties default to <c>null</c>.
/// </summary>
/// <remarks>
/// NOTE(review): property declaration order is left untouched because a reflection-based
/// serializer may derive column order from it - confirm before reordering.
/// </remarks>
public sealed class CoreOutputRow
{
    /// <summary>Combined HSN/TSN identifier.</summary>
    public string HsnTsn { get; set; } = "";

    /// <summary>HSN part of the identifier.</summary>
    public string Hsn { get; set; } = "";

    /// <summary>TSN part of the identifier.</summary>
    public string Tsn { get; set; } = "";

    /// <summary>Brand name.</summary>
    public string Brand { get; set; } = "";

    /// <summary>Model name.</summary>
    public string Model { get; set; } = "";

    /// <summary>Last year of the range, or <c>null</c> when unknown.</summary>
    public int? YearTo { get; set; }

    /// <summary>First year of the range, or <c>null</c> when unknown.</summary>
    public int? YearFrom { get; set; }
}
/// <summary>
/// In-memory counterpart of one row of the vehicle table; each property corresponds to one
/// column (hsn_tsn, hsn, tsn, brand, vehicle_type, model, official_type, year_from, year_to,
/// power_ps, power_kw, displacement_ccm, fuel_type, match_key).
/// Text properties default to an empty string; numeric properties default to <c>null</c>.
/// </summary>
public sealed class DbVehicleRow
{
    /// <summary>Combined HSN/TSN identifier; the table's primary key.</summary>
    public string HsnTsn { get; set; } = "";

    /// <summary>HSN part of the identifier.</summary>
    public string Hsn { get; set; } = "";

    /// <summary>TSN part of the identifier.</summary>
    public string Tsn { get; set; } = "";

    /// <summary>Brand name.</summary>
    public string Brand { get; set; } = "";

    /// <summary>Vehicle type.</summary>
    public string VehicleType { get; set; } = "";

    /// <summary>Model name.</summary>
    public string Model { get; set; } = "";

    /// <summary>Official type designation.</summary>
    public string OfficialType { get; set; } = "";

    /// <summary>First year of the range, or <c>null</c> when unknown.</summary>
    public int? YearFrom { get; set; }

    /// <summary>Last year of the range, or <c>null</c> when unknown.</summary>
    public int? YearTo { get; set; }

    /// <summary>Power in PS, or <c>null</c> when unknown.</summary>
    public int? PowerPs { get; set; }

    /// <summary>Power in kW, or <c>null</c> when unknown.</summary>
    public int? PowerKw { get; set; }

    /// <summary>Displacement in ccm, or <c>null</c> when unknown.</summary>
    public int? DisplacementCcm { get; set; }

    /// <summary>Fuel type.</summary>
    public string FuelType { get; set; } = "";

    /// <summary>Match key stored alongside the vehicle.</summary>
    public string MatchKey { get; set; } = "";
}