Update vehicle scraping logic and enhance error handling; add CSV output flushing

This commit is contained in:
2026-03-04 14:43:27 +03:00
parent fb2c2cdb8a
commit a10aabe4bf
3 changed files with 101 additions and 98 deletions
+1
View File
@@ -1,3 +1,4 @@
bin/ bin/
obj/ obj/
.idea/ .idea/
hsntsn.csv
+7 -7
View File
@@ -2,9 +2,9 @@
.NET console scraper. .NET console scraper.
Kaynak: `http://www.hsn-tsn.de/` Source: `http://www.hsn-tsn.de/`
CSV cikti alanlari: CSV output fields:
- `HsnTsn`, `Hsn`, `Tsn` - `HsnTsn`, `Hsn`, `Tsn`
- `Brand`, `VehicleType`, `Model`, `OfficialType` - `Brand`, `VehicleType`, `Model`, `OfficialType`
@@ -13,22 +13,22 @@ CSV cikti alanlari:
- `MatchKey` - `MatchKey`
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl` - `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
## Calistirma ## Usage
Tum marka sayfalarini tara: Scrape all brand pages:
```bash ```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
``` ```
Sadece verilen sorgulari tara (`stdin`): Scrape only specific queries from `stdin`:
```bash ```bash
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
``` ```
Detay sayfasi zenginlestirmesini kapat: Enable detail-page enrichment:
```bash ```bash
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
``` ```
+86 -84
View File
@@ -3,39 +3,101 @@ using CsvHelper;
using CsvHelper.Configuration; using CsvHelper.Configuration;
using HsnTsnScraper; using HsnTsnScraper;
var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase); var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
using var client = new HsnTsnClient(); using var client = new HsnTsnClient();
var map = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase); var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0;
var failed = 0;
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";"
});
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
await csvWriter.FlushAsync();
if (Console.IsInputRedirected) if (Console.IsInputRedirected)
{ {
await foreach (var query in ReadInput()) await foreach (var query in ReadInput())
{ {
var vehicles = await client.GetVehiclesFromSearchAsync(query); IReadOnlyList<HsnTsnVehicle> vehicles;
Merge(map, vehicles); try
{
vehicles = await client.GetVehiclesFromSearchAsync(query);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
continue;
}
foreach (var vehicle in vehicles)
{
await WriteVehicleIfNew(vehicle);
}
} }
} }
else else
{ {
var brandUrls = await client.GetBrandPageUrls(); IReadOnlyList<string> brandUrls;
try
{
brandUrls = await client.GetBrandPageUrls();
}
catch (Exception ex)
{
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
return;
}
foreach (var url in brandUrls) foreach (var url in brandUrls)
{ {
var vehicles = await client.GetVehiclesFromBrandPageAsync(url); Console.Error.WriteLine($"[info] Processing: {url}");
Merge(map, vehicles); IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
} }
} catch (Exception ex)
if (includeDetails)
{
foreach (var vehicle in map.Values)
{
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
if (detail is null)
{ {
failed++;
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
continue; continue;
} }
foreach (var vehicle in vehicles)
{
await WriteVehicleIfNew(vehicle);
}
}
}
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
return;
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{
processed++;
if (!written.Add(vehicle.HsnTsn))
{
return;
}
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
try
{
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
if (detail is not null)
{
if (!string.IsNullOrWhiteSpace(detail.Brand)) if (!string.IsNullOrWhiteSpace(detail.Brand))
{ {
vehicle.Brand = detail.Brand; vehicle.Brand = detail.Brand;
@@ -51,81 +113,21 @@ if (includeDetails)
vehicle.SourceDetailUrl = detail.CanonicalUrl; vehicle.SourceDetailUrl = detail.CanonicalUrl;
} }
} }
} }
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
}
}
foreach (var vehicle in map.Values)
{
vehicle.MatchKey = BuildMatchKey(vehicle); vehicle.MatchKey = BuildMatchKey(vehicle);
}
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{
Delimiter = ";"
});
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
{
csvWriter.WriteRecord(vehicle); csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync(); await csvWriter.NextRecordAsync();
}
await csvWriter.FlushAsync(); if (written.Count % 250 == 0)
return;
void Merge(IDictionary<string, HsnTsnVehicle> mapByHsnTsn, IEnumerable<HsnTsnVehicle> vehicles)
{
foreach (var vehicle in vehicles)
{ {
var key = vehicle.HsnTsn; await csvWriter.FlushAsync();
if (!mapByHsnTsn.TryGetValue(key, out var existing))
{
mapByHsnTsn[key] = vehicle;
continue;
}
if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType))
{
existing.VehicleType = vehicle.VehicleType;
}
if (existing.PowerPs is null && vehicle.PowerPs is not null)
{
existing.PowerPs = vehicle.PowerPs;
}
if (existing.PowerKw is null && vehicle.PowerKw is not null)
{
existing.PowerKw = vehicle.PowerKw;
}
if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null)
{
existing.DisplacementCcm = vehicle.DisplacementCcm;
}
if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType))
{
existing.FuelType = vehicle.FuelType;
}
if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery))
{
existing.SourceQuery = vehicle.SourceQuery;
}
if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl))
{
existing.SourceListUrl = vehicle.SourceListUrl;
}
if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
existing.SourceDetailUrl = vehicle.SourceDetailUrl;
}
} }
} }