Update vehicle scraping logic and enhance error handling; add CSV output flushing

This commit is contained in:
2026-03-04 14:43:27 +03:00
parent fb2c2cdb8a
commit a10aabe4bf
3 changed files with 101 additions and 98 deletions
+1
View File
@@ -1,3 +1,4 @@
bin/ bin/
obj/ obj/
.idea/ .idea/
hsntsn.csv
+7 -7
View File
@@ -2,9 +2,9 @@
.NET console scraper. .NET console scraper.
Kaynak: `http://www.hsn-tsn.de/` Source: `http://www.hsn-tsn.de/`
CSV cikti alanlari: CSV output fields:
- `HsnTsn`, `Hsn`, `Tsn` - `HsnTsn`, `Hsn`, `Tsn`
- `Brand`, `VehicleType`, `Model`, `OfficialType` - `Brand`, `VehicleType`, `Model`, `OfficialType`
@@ -13,22 +13,22 @@ CSV cikti alanlari:
- `MatchKey` - `MatchKey`
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl` - `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
## Calistirma ## Usage
Tum marka sayfalarini tara: Scrape all brand pages:
```bash ```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
``` ```
Sadece verilen sorgulari tara (`stdin`): Scrape only specific queries from `stdin`:
```bash ```bash
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
``` ```
Detay sayfasi zenginlestirmesini kapat: Enable detail-page enrichment:
```bash ```bash
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
``` ```
+93 -91
View File
@@ -3,60 +3,14 @@ using CsvHelper;
using CsvHelper.Configuration; using CsvHelper.Configuration;
using HsnTsnScraper; using HsnTsnScraper;
var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase); var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
using var client = new HsnTsnClient(); using var client = new HsnTsnClient();
var map = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase); var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0;
var failed = 0;
if (Console.IsInputRedirected) Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
{
await foreach (var query in ReadInput())
{
var vehicles = await client.GetVehiclesFromSearchAsync(query);
Merge(map, vehicles);
}
}
else
{
var brandUrls = await client.GetBrandPageUrls();
foreach (var url in brandUrls)
{
var vehicles = await client.GetVehiclesFromBrandPageAsync(url);
Merge(map, vehicles);
}
}
if (includeDetails)
{
foreach (var vehicle in map.Values)
{
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
if (detail is null)
{
continue;
}
if (!string.IsNullOrWhiteSpace(detail.Brand))
{
vehicle.Brand = detail.Brand;
}
vehicle.Model = detail.Model;
vehicle.OfficialType = detail.OfficialType;
vehicle.YearFrom = detail.YearFrom;
vehicle.YearTo = detail.YearTo;
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
{
vehicle.SourceDetailUrl = detail.CanonicalUrl;
}
}
}
foreach (var vehicle in map.Values)
{
vehicle.MatchKey = BuildMatchKey(vehicle);
}
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture) await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{ {
@@ -65,67 +19,115 @@ await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(Cult
csvWriter.WriteHeader<HsnTsnVehicle>(); csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync(); await csvWriter.NextRecordAsync();
foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
{
csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync();
}
await csvWriter.FlushAsync(); await csvWriter.FlushAsync();
return; if (Console.IsInputRedirected)
void Merge(IDictionary<string, HsnTsnVehicle> mapByHsnTsn, IEnumerable<HsnTsnVehicle> vehicles)
{ {
foreach (var vehicle in vehicles) await foreach (var query in ReadInput())
{ {
var key = vehicle.HsnTsn; IReadOnlyList<HsnTsnVehicle> vehicles;
if (!mapByHsnTsn.TryGetValue(key, out var existing)) try
{ {
mapByHsnTsn[key] = vehicle; vehicles = await client.GetVehiclesFromSearchAsync(query);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
continue; continue;
} }
if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType)) foreach (var vehicle in vehicles)
{ {
existing.VehicleType = vehicle.VehicleType; await WriteVehicleIfNew(vehicle);
}
}
}
else
{
IReadOnlyList<string> brandUrls;
try
{
brandUrls = await client.GetBrandPageUrls();
}
catch (Exception ex)
{
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
return;
}
foreach (var url in brandUrls)
{
Console.Error.WriteLine($"[info] Processing: {url}");
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
continue;
} }
if (existing.PowerPs is null && vehicle.PowerPs is not null) foreach (var vehicle in vehicles)
{ {
existing.PowerPs = vehicle.PowerPs; await WriteVehicleIfNew(vehicle);
} }
}
}
if (existing.PowerKw is null && vehicle.PowerKw is not null) await csvWriter.FlushAsync();
{ Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
existing.PowerKw = vehicle.PowerKw; return;
}
if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null) async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{ {
existing.DisplacementCcm = vehicle.DisplacementCcm; processed++;
}
if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType)) if (!written.Add(vehicle.HsnTsn))
{ {
existing.FuelType = vehicle.FuelType; return;
} }
if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery)) if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
try
{ {
existing.SourceQuery = vehicle.SourceQuery; var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
} if (detail is not null)
{
if (!string.IsNullOrWhiteSpace(detail.Brand))
{
vehicle.Brand = detail.Brand;
}
if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl)) vehicle.Model = detail.Model;
{ vehicle.OfficialType = detail.OfficialType;
existing.SourceListUrl = vehicle.SourceListUrl; vehicle.YearFrom = detail.YearFrom;
} vehicle.YearTo = detail.YearTo;
if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl)) if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
{ {
existing.SourceDetailUrl = vehicle.SourceDetailUrl; vehicle.SourceDetailUrl = detail.CanonicalUrl;
}
}
} }
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
}
}
vehicle.MatchKey = BuildMatchKey(vehicle);
csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync();
if (written.Count % 250 == 0)
{
await csvWriter.FlushAsync();
} }
} }