Update vehicle scraping logic and enhance error handling; add CSV output flushing
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
bin/
|
||||
obj/
|
||||
.idea/
|
||||
hsntsn.csv
|
||||
@@ -2,9 +2,9 @@
|
||||
|
||||
.NET console scraper.
|
||||
|
||||
Kaynak: `http://www.hsn-tsn.de/`
|
||||
Source: `http://www.hsn-tsn.de/`
|
||||
|
||||
CSV cikti alanlari:
|
||||
CSV output fields:
|
||||
|
||||
- `HsnTsn`, `Hsn`, `Tsn`
|
||||
- `Brand`, `VehicleType`, `Model`, `OfficialType`
|
||||
@@ -13,22 +13,22 @@ CSV cikti alanlari:
|
||||
- `MatchKey`
|
||||
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
|
||||
|
||||
## Calistirma
|
||||
## Usage
|
||||
|
||||
Tum marka sayfalarini tara:
|
||||
Scrape all brand pages:
|
||||
|
||||
```bash
|
||||
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
||||
```
|
||||
|
||||
Sadece verilen sorgulari tara (`stdin`):
|
||||
Scrape only specific queries from `stdin`:
|
||||
|
||||
```bash
|
||||
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
||||
```
|
||||
|
||||
Detay sayfasi zenginlestirmesini kapat:
|
||||
Enable detail-page enrichment:
|
||||
|
||||
```bash
|
||||
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details
|
||||
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
|
||||
```
|
||||
|
||||
@@ -3,60 +3,14 @@ using CsvHelper;
|
||||
using CsvHelper.Configuration;
|
||||
using HsnTsnScraper;
|
||||
|
||||
var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase);
|
||||
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
using var client = new HsnTsnClient();
|
||||
var map = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);
|
||||
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||
var processed = 0;
|
||||
var failed = 0;
|
||||
|
||||
if (Console.IsInputRedirected)
|
||||
{
|
||||
await foreach (var query in ReadInput())
|
||||
{
|
||||
var vehicles = await client.GetVehiclesFromSearchAsync(query);
|
||||
Merge(map, vehicles);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
var brandUrls = await client.GetBrandPageUrls();
|
||||
foreach (var url in brandUrls)
|
||||
{
|
||||
var vehicles = await client.GetVehiclesFromBrandPageAsync(url);
|
||||
Merge(map, vehicles);
|
||||
}
|
||||
}
|
||||
|
||||
if (includeDetails)
|
||||
{
|
||||
foreach (var vehicle in map.Values)
|
||||
{
|
||||
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
|
||||
if (detail is null)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
||||
{
|
||||
vehicle.Brand = detail.Brand;
|
||||
}
|
||||
|
||||
vehicle.Model = detail.Model;
|
||||
vehicle.OfficialType = detail.OfficialType;
|
||||
vehicle.YearFrom = detail.YearFrom;
|
||||
vehicle.YearTo = detail.YearTo;
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
|
||||
{
|
||||
vehicle.SourceDetailUrl = detail.CanonicalUrl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var vehicle in map.Values)
|
||||
{
|
||||
vehicle.MatchKey = BuildMatchKey(vehicle);
|
||||
}
|
||||
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
|
||||
|
||||
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
|
||||
{
|
||||
@@ -65,67 +19,115 @@ await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(Cult
|
||||
|
||||
csvWriter.WriteHeader<HsnTsnVehicle>();
|
||||
await csvWriter.NextRecordAsync();
|
||||
|
||||
foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
|
||||
{
|
||||
csvWriter.WriteRecord(vehicle);
|
||||
await csvWriter.NextRecordAsync();
|
||||
}
|
||||
|
||||
await csvWriter.FlushAsync();
|
||||
|
||||
return;
|
||||
|
||||
void Merge(IDictionary<string, HsnTsnVehicle> mapByHsnTsn, IEnumerable<HsnTsnVehicle> vehicles)
|
||||
if (Console.IsInputRedirected)
|
||||
{
|
||||
foreach (var vehicle in vehicles)
|
||||
await foreach (var query in ReadInput())
|
||||
{
|
||||
var key = vehicle.HsnTsn;
|
||||
if (!mapByHsnTsn.TryGetValue(key, out var existing))
|
||||
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||
try
|
||||
{
|
||||
mapByHsnTsn[key] = vehicle;
|
||||
vehicles = await client.GetVehiclesFromSearchAsync(query);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
failed++;
|
||||
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType))
|
||||
foreach (var vehicle in vehicles)
|
||||
{
|
||||
existing.VehicleType = vehicle.VehicleType;
|
||||
await WriteVehicleIfNew(vehicle);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
IReadOnlyList<string> brandUrls;
|
||||
try
|
||||
{
|
||||
brandUrls = await client.GetBrandPageUrls();
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
|
||||
return;
|
||||
}
|
||||
|
||||
foreach (var url in brandUrls)
|
||||
{
|
||||
Console.Error.WriteLine($"[info] Processing: {url}");
|
||||
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||
try
|
||||
{
|
||||
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
failed++;
|
||||
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (existing.PowerPs is null && vehicle.PowerPs is not null)
|
||||
foreach (var vehicle in vehicles)
|
||||
{
|
||||
existing.PowerPs = vehicle.PowerPs;
|
||||
await WriteVehicleIfNew(vehicle);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (existing.PowerKw is null && vehicle.PowerKw is not null)
|
||||
{
|
||||
existing.PowerKw = vehicle.PowerKw;
|
||||
}
|
||||
await csvWriter.FlushAsync();
|
||||
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
|
||||
return;
|
||||
|
||||
if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null)
|
||||
{
|
||||
existing.DisplacementCcm = vehicle.DisplacementCcm;
|
||||
}
|
||||
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
|
||||
{
|
||||
processed++;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType))
|
||||
{
|
||||
existing.FuelType = vehicle.FuelType;
|
||||
}
|
||||
if (!written.Add(vehicle.HsnTsn))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery))
|
||||
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
||||
{
|
||||
try
|
||||
{
|
||||
existing.SourceQuery = vehicle.SourceQuery;
|
||||
}
|
||||
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
|
||||
if (detail is not null)
|
||||
{
|
||||
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
||||
{
|
||||
vehicle.Brand = detail.Brand;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl))
|
||||
{
|
||||
existing.SourceListUrl = vehicle.SourceListUrl;
|
||||
}
|
||||
vehicle.Model = detail.Model;
|
||||
vehicle.OfficialType = detail.OfficialType;
|
||||
vehicle.YearFrom = detail.YearFrom;
|
||||
vehicle.YearTo = detail.YearTo;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
||||
{
|
||||
existing.SourceDetailUrl = vehicle.SourceDetailUrl;
|
||||
if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
|
||||
{
|
||||
vehicle.SourceDetailUrl = detail.CanonicalUrl;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
failed++;
|
||||
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
vehicle.MatchKey = BuildMatchKey(vehicle);
|
||||
csvWriter.WriteRecord(vehicle);
|
||||
await csvWriter.NextRecordAsync();
|
||||
|
||||
if (written.Count % 250 == 0)
|
||||
{
|
||||
await csvWriter.FlushAsync();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user