diff --git a/.gitignore b/.gitignore index 0e945e0..384836e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ bin/ obj/ .idea/ +hsntsn.csv \ No newline at end of file diff --git a/README.md b/README.md index bae79b4..31d79e5 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ .NET console scraper. -Kaynak: `http://www.hsn-tsn.de/` +Source: `http://www.hsn-tsn.de/` -CSV cikti alanlari: +CSV output fields: - `HsnTsn`, `Hsn`, `Tsn` - `Brand`, `VehicleType`, `Model`, `OfficialType` @@ -13,22 +13,22 @@ CSV cikti alanlari: - `MatchKey` - `SourceQuery`, `SourceListUrl`, `SourceDetailUrl` -## Calistirma +## Usage -Tum marka sayfalarini tara: +Scrape all brand pages: ```bash dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv ``` -Sadece verilen sorgulari tara (`stdin`): +Scrape only specific queries from `stdin`: ```bash printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv ``` -Detay sayfasi zenginlestirmesini kapat: +Enable detail-page enrichment: ```bash -printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details +printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --include-details ``` diff --git a/src/HsnTsnScraper/Program.cs b/src/HsnTsnScraper/Program.cs index 3a62c42..588c495 100644 --- a/src/HsnTsnScraper/Program.cs +++ b/src/HsnTsnScraper/Program.cs @@ -3,60 +3,14 @@ using CsvHelper; using CsvHelper.Configuration; using HsnTsnScraper; -var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase); +var includeDetails = args.Contains("--include-details", StringComparer.OrdinalIgnoreCase); using var client = new HsnTsnClient(); -var map = new Dictionary(StringComparer.OrdinalIgnoreCase); +var written = new HashSet(StringComparer.OrdinalIgnoreCase); +var processed = 0; +var failed = 0; -if (Console.IsInputRedirected) -{ - await foreach (var query in ReadInput()) - { - var vehicles = await client.GetVehiclesFromSearchAsync(query); - Merge(map, vehicles); - } -} -else -{ - var brandUrls = await client.GetBrandPageUrls(); - foreach (var url in brandUrls) - { - var vehicles = await client.GetVehiclesFromBrandPageAsync(url); - Merge(map, vehicles); - } -} - -if (includeDetails) -{ - foreach (var vehicle in map.Values) - { - var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl); - if (detail is null) - { - continue; - } - - if (!string.IsNullOrWhiteSpace(detail.Brand)) - { - vehicle.Brand = detail.Brand; - } - - vehicle.Model = detail.Model; - vehicle.OfficialType = detail.OfficialType; - vehicle.YearFrom = detail.YearFrom; - vehicle.YearTo = detail.YearTo; - - if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl)) - { - vehicle.SourceDetailUrl = detail.CanonicalUrl; - } - } -} - -foreach (var vehicle in map.Values) -{ - vehicle.MatchKey = BuildMatchKey(vehicle); -} +Console.Error.WriteLine($"[info] Scrape started. includeDetails={includeDetails}"); await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture) { @@ -65,67 +19,115 @@ await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(Cult csvWriter.WriteHeader(); await csvWriter.NextRecordAsync(); - -foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase)) -{ - csvWriter.WriteRecord(vehicle); - await csvWriter.NextRecordAsync(); -} - await csvWriter.FlushAsync(); -return; - -void Merge(IDictionary mapByHsnTsn, IEnumerable vehicles) +if (Console.IsInputRedirected) { - foreach (var vehicle in vehicles) + await foreach (var query in ReadInput()) { - var key = vehicle.HsnTsn; - if (!mapByHsnTsn.TryGetValue(key, out var existing)) + IReadOnlyList vehicles; + try { - mapByHsnTsn[key] = vehicle; + vehicles = await client.GetVehiclesFromSearchAsync(query); + } + catch (Exception ex) + { + failed++; + Console.Error.WriteLine($"[warn] Query failed: {query} -> {ex.Message}"); continue; } - if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType)) + foreach (var vehicle in vehicles) { - existing.VehicleType = vehicle.VehicleType; + await WriteVehicleIfNew(vehicle); + } + } +} +else +{ + IReadOnlyList brandUrls; + try + { + brandUrls = await client.GetBrandPageUrls(); + } + catch (Exception ex) + { + Console.Error.WriteLine($"[error] Could not fetch brand urls: {ex.Message}"); + return; + } + + foreach (var url in brandUrls) + { + Console.Error.WriteLine($"[info] Processing: {url}"); + IReadOnlyList vehicles; + try + { + vehicles = await client.GetVehiclesFromBrandPageAsync(url); + } + catch (Exception ex) + { + failed++; + Console.Error.WriteLine($"[warn] Brand page failed: {url} -> {ex.Message}"); + continue; } - if (existing.PowerPs is null && vehicle.PowerPs is not null) + foreach (var vehicle in vehicles) { - existing.PowerPs = vehicle.PowerPs; + await WriteVehicleIfNew(vehicle); } + } +} - if (existing.PowerKw is null && vehicle.PowerKw is not null) - { - existing.PowerKw = vehicle.PowerKw; - } +await csvWriter.FlushAsync(); +Console.Error.WriteLine($"[info] Scrape finished. written={written.Count}, processed={processed}, failed={failed}"); +return; - if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null) - { - existing.DisplacementCcm = vehicle.DisplacementCcm; - } +async Task WriteVehicleIfNew(HsnTsnVehicle vehicle) +{ + processed++; - if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType)) - { - existing.FuelType = vehicle.FuelType; - } + if (!written.Add(vehicle.HsnTsn)) + { + return; + } - if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery)) + if (includeDetails && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl)) + { + try { - existing.SourceQuery = vehicle.SourceQuery; - } + var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl); + if (detail is not null) + { + if (!string.IsNullOrWhiteSpace(detail.Brand)) + { + vehicle.Brand = detail.Brand; + } - if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl)) - { - existing.SourceListUrl = vehicle.SourceListUrl; - } + vehicle.Model = detail.Model; + vehicle.OfficialType = detail.OfficialType; + vehicle.YearFrom = detail.YearFrom; + vehicle.YearTo = detail.YearTo; - if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl)) - { - existing.SourceDetailUrl = vehicle.SourceDetailUrl; + if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl)) + { + vehicle.SourceDetailUrl = detail.CanonicalUrl; + } + } } + catch (Exception ex) + { + failed++; + Console.Error.WriteLine($"[warn] Detail failed: {vehicle.HsnTsn} -> {ex.Message}"); + } + } + + vehicle.MatchKey = BuildMatchKey(vehicle); + csvWriter.WriteRecord(vehicle); + await csvWriter.NextRecordAsync(); + + if (written.Count % 250 == 0) + { + await csvWriter.FlushAsync(); } }