Update vehicle scraping logic and enhance error handling; add CSV output flushing

This commit is contained in:
2026-03-04 14:43:27 +03:00
parent fb2c2cdb8a
commit a10aabe4bf
3 changed files with 101 additions and 98 deletions
+1
View File
@@ -1,3 +1,4 @@
bin/
obj/
.idea/
hsntsn.csv
+7 -7
View File
@@ -2,9 +2,9 @@
.NET console scraper.
Kaynak: `http://www.hsn-tsn.de/`
Source: `http://www.hsn-tsn.de/`
CSV cikti alanlari:
CSV output fields:
- `HsnTsn`, `Hsn`, `Tsn`
- `Brand`, `VehicleType`, `Model`, `OfficialType`
@@ -13,22 +13,22 @@ CSV cikti alanlari:
- `MatchKey`
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
## Calistirma
## Usage
Tum marka sayfalarini tara:
Scrape all brand pages:
```bash
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
```
Sadece verilen sorgulari tara (`stdin`):
Scrape only specific queries from `stdin`:
```bash
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
```
Detay sayfasi zenginlestirmesini kapat:
Enable detail-page enrichment (off by default; opt in with `--include-details`):
```bash
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
```
+93 -91
View File
@@ -3,60 +3,14 @@ using CsvHelper;
using CsvHelper.Configuration;
using HsnTsnScraper;
var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase);
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
using var client = new HsnTsnClient();
var map = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var processed = 0;
var failed = 0;
if (Console.IsInputRedirected)
{
await foreach (var query in ReadInput())
{
var vehicles = await client.GetVehiclesFromSearchAsync(query);
Merge(map, vehicles);
}
}
else
{
var brandUrls = await client.GetBrandPageUrls();
foreach (var url in brandUrls)
{
var vehicles = await client.GetVehiclesFromBrandPageAsync(url);
Merge(map, vehicles);
}
}
if (includeDetails)
{
foreach (var vehicle in map.Values)
{
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
if (detail is null)
{
continue;
}
if (!string.IsNullOrWhiteSpace(detail.Brand))
{
vehicle.Brand = detail.Brand;
}
vehicle.Model = detail.Model;
vehicle.OfficialType = detail.OfficialType;
vehicle.YearFrom = detail.YearFrom;
vehicle.YearTo = detail.YearTo;
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
{
vehicle.SourceDetailUrl = detail.CanonicalUrl;
}
}
}
foreach (var vehicle in map.Values)
{
vehicle.MatchKey = BuildMatchKey(vehicle);
}
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{
@@ -65,67 +19,115 @@ await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(Cult
csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();
foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
{
csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync();
}
await csvWriter.FlushAsync();
return;
void Merge(IDictionary<string, HsnTsnVehicle> mapByHsnTsn, IEnumerable<HsnTsnVehicle> vehicles)
if (Console.IsInputRedirected)
{
foreach (var vehicle in vehicles)
await foreach (var query in ReadInput())
{
var key = vehicle.HsnTsn;
if (!mapByHsnTsn.TryGetValue(key, out var existing))
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
mapByHsnTsn[key] = vehicle;
vehicles = await client.GetVehiclesFromSearchAsync(query);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
continue;
}
if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType))
foreach (var vehicle in vehicles)
{
existing.VehicleType = vehicle.VehicleType;
await WriteVehicleIfNew(vehicle);
}
}
}
else
{
IReadOnlyList<string> brandUrls;
try
{
brandUrls = await client.GetBrandPageUrls();
}
catch (Exception ex)
{
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
return;
}
foreach (var url in brandUrls)
{
Console.Error.WriteLine($"[info] Processing: {url}");
IReadOnlyList<HsnTsnVehicle> vehicles;
try
{
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
continue;
}
if (existing.PowerPs is null && vehicle.PowerPs is not null)
foreach (var vehicle in vehicles)
{
existing.PowerPs = vehicle.PowerPs;
await WriteVehicleIfNew(vehicle);
}
}
}
if (existing.PowerKw is null && vehicle.PowerKw is not null)
{
existing.PowerKw = vehicle.PowerKw;
}
await csvWriter.FlushAsync();
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
return;
if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null)
{
existing.DisplacementCcm = vehicle.DisplacementCcm;
}
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
{
processed++;
if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType))
{
existing.FuelType = vehicle.FuelType;
}
if (!written.Add(vehicle.HsnTsn))
{
return;
}
if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery))
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
try
{
existing.SourceQuery = vehicle.SourceQuery;
}
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
if (detail is not null)
{
if (!string.IsNullOrWhiteSpace(detail.Brand))
{
vehicle.Brand = detail.Brand;
}
if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl))
{
existing.SourceListUrl = vehicle.SourceListUrl;
}
vehicle.Model = detail.Model;
vehicle.OfficialType = detail.OfficialType;
vehicle.YearFrom = detail.YearFrom;
vehicle.YearTo = detail.YearTo;
if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
{
existing.SourceDetailUrl = vehicle.SourceDetailUrl;
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
{
vehicle.SourceDetailUrl = detail.CanonicalUrl;
}
}
}
catch (Exception ex)
{
failed++;
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
}
}
vehicle.MatchKey = BuildMatchKey(vehicle);
csvWriter.WriteRecord(vehicle);
await csvWriter.NextRecordAsync();
if (written.Count % 250 == 0)
{
await csvWriter.FlushAsync();
}
}