Update vehicle scraping logic and enhance error handling; add CSV output flushing
This commit is contained in:
@@ -1,3 +1,4 @@
|
|||||||
bin/
|
bin/
|
||||||
obj/
|
obj/
|
||||||
.idea/
|
.idea/
|
||||||
|
hsntsn.csv
|
||||||
@@ -2,9 +2,9 @@
|
|||||||
|
|
||||||
.NET console scraper.
|
.NET console scraper.
|
||||||
|
|
||||||
Kaynak: `http://www.hsn-tsn.de/`
|
Source: `http://www.hsn-tsn.de/`
|
||||||
|
|
||||||
CSV cikti alanlari:
|
CSV output fields:
|
||||||
|
|
||||||
- `HsnTsn`, `Hsn`, `Tsn`
|
- `HsnTsn`, `Hsn`, `Tsn`
|
||||||
- `Brand`, `VehicleType`, `Model`, `OfficialType`
|
- `Brand`, `VehicleType`, `Model`, `OfficialType`
|
||||||
@@ -13,22 +13,22 @@ CSV cikti alanlari:
|
|||||||
- `MatchKey`
|
- `MatchKey`
|
||||||
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
|
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
|
||||||
|
|
||||||
## Calistirma
|
## Usage
|
||||||
|
|
||||||
Tum marka sayfalarini tara:
|
Scrape all brand pages:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
||||||
```
|
```
|
||||||
|
|
||||||
Sadece verilen sorgulari tara (`stdin`):
|
Scrape only specific queries from `stdin`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
||||||
```
|
```
|
||||||
|
|
||||||
Detay sayfasi zenginlestirmesini kapat:
|
Enable detail-page enrichment:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details
|
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -3,39 +3,101 @@ using CsvHelper;
|
|||||||
using CsvHelper.Configuration;
|
using CsvHelper.Configuration;
|
||||||
using HsnTsnScraper;
|
using HsnTsnScraper;
|
||||||
|
|
||||||
var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase);
|
var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase);
|
||||||
|
|
||||||
using var client = new HsnTsnClient();
|
using var client = new HsnTsnClient();
|
||||||
var map = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);
|
var written = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||||
|
var processed = 0;
|
||||||
|
var failed = 0;
|
||||||
|
|
||||||
|
Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}");
|
||||||
|
|
||||||
|
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
|
||||||
|
{
|
||||||
|
Delimiter = ";"
|
||||||
|
});
|
||||||
|
|
||||||
|
csvWriter.WriteHeader<HsnTsnVehicle>();
|
||||||
|
await csvWriter.NextRecordAsync();
|
||||||
|
await csvWriter.FlushAsync();
|
||||||
|
|
||||||
if (Console.IsInputRedirected)
|
if (Console.IsInputRedirected)
|
||||||
{
|
{
|
||||||
await foreach (var query in ReadInput())
|
await foreach (var query in ReadInput())
|
||||||
{
|
{
|
||||||
var vehicles = await client.GetVehiclesFromSearchAsync(query);
|
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||||
Merge(map, vehicles);
|
try
|
||||||
|
{
|
||||||
|
vehicles = await client.GetVehiclesFromSearchAsync(query);
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach (var vehicle in vehicles)
|
||||||
|
{
|
||||||
|
await WriteVehicleIfNew(vehicle);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
var brandUrls = await client.GetBrandPageUrls();
|
IReadOnlyList<string> brandUrls;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
brandUrls = await client.GetBrandPageUrls();
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
foreach (var url in brandUrls)
|
foreach (var url in brandUrls)
|
||||||
{
|
{
|
||||||
var vehicles = await client.GetVehiclesFromBrandPageAsync(url);
|
Console.Error.WriteLine($"[info] Processing: {url}");
|
||||||
Merge(map, vehicles);
|
IReadOnlyList<HsnTsnVehicle> vehicles;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
vehicles = await client.GetVehiclesFromBrandPageAsync(url);
|
||||||
}
|
}
|
||||||
}
|
catch (Exception ex)
|
||||||
|
|
||||||
if (includeDetails)
|
|
||||||
{
|
|
||||||
foreach (var vehicle in map.Values)
|
|
||||||
{
|
|
||||||
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
|
|
||||||
if (detail is null)
|
|
||||||
{
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
foreach (var vehicle in vehicles)
|
||||||
|
{
|
||||||
|
await WriteVehicleIfNew(vehicle);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
await csvWriter.FlushAsync();
|
||||||
|
Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}");
|
||||||
|
return;
|
||||||
|
|
||||||
|
async Task WriteVehicleIfNew(HsnTsnVehicle vehicle)
|
||||||
|
{
|
||||||
|
processed++;
|
||||||
|
|
||||||
|
if (!written.Add(vehicle.HsnTsn))
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
|
||||||
|
if (detail is not null)
|
||||||
|
{
|
||||||
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
if (!string.IsNullOrWhiteSpace(detail.Brand))
|
||||||
{
|
{
|
||||||
vehicle.Brand = detail.Brand;
|
vehicle.Brand = detail.Brand;
|
||||||
@@ -51,81 +113,21 @@ if (includeDetails)
|
|||||||
vehicle.SourceDetailUrl = detail.CanonicalUrl;
|
vehicle.SourceDetailUrl = detail.CanonicalUrl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
failed++;
|
||||||
|
Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
foreach (var vehicle in map.Values)
|
|
||||||
{
|
|
||||||
vehicle.MatchKey = BuildMatchKey(vehicle);
|
vehicle.MatchKey = BuildMatchKey(vehicle);
|
||||||
}
|
|
||||||
|
|
||||||
await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
|
|
||||||
{
|
|
||||||
Delimiter = ";"
|
|
||||||
});
|
|
||||||
|
|
||||||
csvWriter.WriteHeader<HsnTsnVehicle>();
|
|
||||||
await csvWriter.NextRecordAsync();
|
|
||||||
|
|
||||||
foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
|
|
||||||
{
|
|
||||||
csvWriter.WriteRecord(vehicle);
|
csvWriter.WriteRecord(vehicle);
|
||||||
await csvWriter.NextRecordAsync();
|
await csvWriter.NextRecordAsync();
|
||||||
}
|
|
||||||
|
|
||||||
await csvWriter.FlushAsync();
|
if (written.Count % 250 == 0)
|
||||||
|
|
||||||
return;
|
|
||||||
|
|
||||||
void Merge(IDictionary<string, HsnTsnVehicle> mapByHsnTsn, IEnumerable<HsnTsnVehicle> vehicles)
|
|
||||||
{
|
|
||||||
foreach (var vehicle in vehicles)
|
|
||||||
{
|
{
|
||||||
var key = vehicle.HsnTsn;
|
await csvWriter.FlushAsync();
|
||||||
if (!mapByHsnTsn.TryGetValue(key, out var existing))
|
|
||||||
{
|
|
||||||
mapByHsnTsn[key] = vehicle;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType))
|
|
||||||
{
|
|
||||||
existing.VehicleType = vehicle.VehicleType;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (existing.PowerPs is null && vehicle.PowerPs is not null)
|
|
||||||
{
|
|
||||||
existing.PowerPs = vehicle.PowerPs;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (existing.PowerKw is null && vehicle.PowerKw is not null)
|
|
||||||
{
|
|
||||||
existing.PowerKw = vehicle.PowerKw;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null)
|
|
||||||
{
|
|
||||||
existing.DisplacementCcm = vehicle.DisplacementCcm;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType))
|
|
||||||
{
|
|
||||||
existing.FuelType = vehicle.FuelType;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery))
|
|
||||||
{
|
|
||||||
existing.SourceQuery = vehicle.SourceQuery;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl))
|
|
||||||
{
|
|
||||||
existing.SourceListUrl = vehicle.SourceListUrl;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl))
|
|
||||||
{
|
|
||||||
existing.SourceDetailUrl = vehicle.SourceDetailUrl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user