commit fb2c2cdb8ac7c774aca4d6d6586a9e20a115aef6 Author: akinayturan Date: Wed Mar 4 14:36:40 2026 +0300 Add HsnTsnClient and related classes for vehicle data scraping - Implement HsnTsnClient to fetch vehicle data from hsn-tsn.de - Create VehicleDetail and HsnTsnVehicle classes for data representation - Add CSV output functionality in Program.cs - Include necessary NuGet packages in project file - Add README with usage instructions diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0e945e0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +bin/ +obj/ +.idea/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..bae79b4 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# hsntsn-scraper + +.NET console scraper. + +Kaynak: `http://www.hsn-tsn.de/` + +CSV cikti alanlari: + +- `HsnTsn`, `Hsn`, `Tsn` +- `Brand`, `VehicleType`, `Model`, `OfficialType` +- `YearFrom`, `YearTo` +- `PowerPs`, `PowerKw`, `DisplacementCcm`, `FuelType` +- `MatchKey` +- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl` + +## Calistirma + +Tum marka sayfalarini tara: + +```bash +dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv +``` + +Sadece verilen sorgulari tara (`stdin`): + +```bash +printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv +``` + +Detay sayfasi zenginlestirmesini kapat: + +```bash +printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details +``` diff --git a/hsntsn-scraper.sln b/hsntsn-scraper.sln new file mode 100644 index 0000000..6df5324 --- /dev/null +++ b/hsntsn-scraper.sln @@ -0,0 +1,39 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{827E0CD3-B72D-47B6-A68D-7590B98EB39B}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HsnTsnScraper", "src\HsnTsnScraper\HsnTsnScraper.csproj", "{F5352DDD-D3DE-4550-A50B-B28C00AA6785}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|Any CPU.Build.0 = Debug|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x64.ActiveCfg = Debug|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x64.Build.0 = Debug|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x86.ActiveCfg = Debug|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x86.Build.0 = Debug|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|Any CPU.ActiveCfg = Release|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|Any CPU.Build.0 = Release|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x64.ActiveCfg = Release|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x64.Build.0 = Release|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x86.ActiveCfg = Release|Any CPU + {F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x86.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {F5352DDD-D3DE-4550-A50B-B28C00AA6785} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B} + EndGlobalSection +EndGlobal diff --git a/src/HsnTsnScraper/HsnTsnClient.cs b/src/HsnTsnScraper/HsnTsnClient.cs new file mode 100644 index 0000000..a363924 --- /dev/null +++ b/src/HsnTsnScraper/HsnTsnClient.cs @@ -0,0 +1,361 @@ +using System.Net; +using System.Text.RegularExpressions; +using HtmlAgilityPack; +using Microsoft.Extensions.Http; +using Polly; +using Polly.Extensions.Http; + +namespace HsnTsnScraper; + +public sealed class HsnTsnClient : IDisposable +{ + private static readonly Regex HsnTsnRegex = new(@"(?\d{4})\s*/\s*(?[A-Z0-9]{3})", RegexOptions.Compiled); + private static readonly Regex PsKwRegex = new(@"(?\d+)\s*PS\s*\((?\d+)\s*kW\)", RegexOptions.Compiled | RegexOptions.IgnoreCase); + private static readonly Regex CcmRegex = new(@"(?[\d\.\,]+)\s*ccm", RegexOptions.Compiled | RegexOptions.IgnoreCase); + private static readonly Regex YearRangeRegex = new(@"(?\d{4})(?:\D+(?\d{4}))?", RegexOptions.Compiled); + private static readonly HashSet ExcludedLinks = new(StringComparer.OrdinalIgnoreCase) + { + "/impressum.php", + "/datenschutz.php", + "/datenschutz" + }; + + private readonly HttpClient _client; + + public HsnTsnClient(string baseUrl = "http://www.hsn-tsn.de/") + { + var retryPolicy = HttpPolicyExtensions + .HandleTransientHttpError() + .OrResult(x => x.StatusCode == HttpStatusCode.TooManyRequests) + .WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt))); + + var socketHandler = new SocketsHttpHandler + { + PooledConnectionLifetime = TimeSpan.FromMinutes(5), + UseCookies = false + }; + + var pollyHandler = new PolicyHttpMessageHandler(retryPolicy) + { + InnerHandler = socketHandler + }; + + _client = new HttpClient(pollyHandler) + { + BaseAddress = new Uri(baseUrl) + }; + + _client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; hsntsn-scraper/1.0)"); + } + + public async Task> GetBrandPageUrls(CancellationToken cancellationToken = default) + { + var homeHtml = await GetStringAsync("/", cancellationToken); + var doc = LoadDocument(homeHtml); + var links = doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null); + + var result = new HashSet(StringComparer.OrdinalIgnoreCase); + foreach (var link in links) + { + var href = link.GetAttributeValue("href", string.Empty).Trim(); + if (string.IsNullOrWhiteSpace(href)) + { + continue; + } + + if (!href.EndsWith(".html", StringComparison.OrdinalIgnoreCase)) + { + continue; + } + + if (ExcludedLinks.Contains(href)) + { + continue; + } + + if (href.StartsWith("http://", StringComparison.OrdinalIgnoreCase) || + href.StartsWith("https://", StringComparison.OrdinalIgnoreCase)) + { + result.Add(href); + } + else + { + result.Add(new Uri(_client.BaseAddress!, href).ToString()); + } + } + + return result.OrderBy(x => x, StringComparer.OrdinalIgnoreCase).ToArray(); + } + + public async Task> GetVehiclesFromSearchAsync(string query, CancellationToken cancellationToken = default) + { + var encoded = Uri.EscapeDataString(query); + var url = $"/liste.php?string={encoded}"; + var html = await GetStringAsync(url, cancellationToken); + return ParseVehiclesFromListPage(html, new Uri(_client.BaseAddress!, url).ToString(), query); + } + + public async Task> GetVehiclesFromBrandPageAsync(string absoluteUrl, CancellationToken cancellationToken = default) + { + var html = await GetStringAsync(absoluteUrl, cancellationToken); + return ParseVehiclesFromListPage(html, absoluteUrl, null); + } + + public async Task GetVehicleDetailAsync(string detailUrl, CancellationToken cancellationToken = default) + { + if (string.IsNullOrWhiteSpace(detailUrl)) + { + return null; + } + + var html = await GetStringAsync(detailUrl, cancellationToken); + var doc = LoadDocument(html); + + var carNode = doc.DocumentNode.SelectSingleNode("//div[@id='car']"); + if (carNode is null) + { + return null; + } + + var detail = new VehicleDetail + { + Hsn = carNode.GetAttributeValue("data-hsn", string.Empty).Trim(), + Tsn = carNode.GetAttributeValue("data-tsn", string.Empty).Trim(), + Brand = HtmlEntity.DeEntitize(doc.DocumentNode.SelectSingleNode("//span[@property='brand']//span[@property='name']")?.InnerText ?? string.Empty).Trim(), + Model = HtmlEntity.DeEntitize(doc.DocumentNode.SelectSingleNode("//span[@property='model']")?.InnerText ?? string.Empty).Trim(), + OfficialType = HtmlEntity.DeEntitize(doc.DocumentNode.SelectSingleNode("//span[@property='alternateName']")?.InnerText ?? string.Empty).Trim(), + CanonicalUrl = doc.DocumentNode.SelectSingleNode("//meta[@property='url']")?.GetAttributeValue("content", string.Empty).Trim() ?? string.Empty + }; + + var yearText = HtmlEntity.DeEntitize(doc.DocumentNode.SelectSingleNode("//small[@property='vehicleModelDate']")?.InnerText ?? string.Empty).Trim(); + ParseYearRange(yearText, out var fromYear, out var toYear); + detail.YearFrom = fromYear; + detail.YearTo = toYear; + + return detail; + } + + public void Dispose() + { + _client.Dispose(); + } + + private async Task GetStringAsync(string relativeOrAbsoluteUrl, CancellationToken cancellationToken) + { + using var response = await _client.GetAsync(relativeOrAbsoluteUrl, cancellationToken); + response.EnsureSuccessStatusCode(); + return await response.Content.ReadAsStringAsync(cancellationToken); + } + + private IReadOnlyList ParseVehiclesFromListPage(string html, string sourceListUrl, string? sourceQuery) + { + var doc = LoadDocument(html); + var rows = doc.DocumentNode.SelectNodes("//table//tr") ?? new HtmlNodeCollection(null); + var pageHeading = Clean(doc.DocumentNode.SelectSingleNode("//h1")?.InnerText ?? string.Empty); + var pageBrand = IsLikelyBrandHeading(pageHeading) ? pageHeading : string.Empty; + var result = new List(); + + foreach (var row in rows) + { + var cells = row.SelectNodes("./td"); + if (cells is null || cells.Count < 5) + { + continue; + } + + var hsnTsnText = Clean(cells[0].InnerText); + var match = HsnTsnRegex.Match(hsnTsnText); + if (!match.Success) + { + continue; + } + + var vehicleTypeRaw = Clean(cells[1].InnerText); + var detailHref = cells[1].SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty; + var detailUrl = ToAbsoluteUrl(detailHref); + + var powerText = Clean(cells[2].InnerText); + ParsePower(powerText, out var ps, out var kw); + + var displacementText = Clean(cells[3].InnerText); + var displacementCcm = ParseDisplacement(displacementText); + + var fuelType = Clean(cells[4].InnerText); + var brand = !string.IsNullOrWhiteSpace(pageBrand) ? pageBrand : ExtractBrand(vehicleTypeRaw); + + var vehicle = new HsnTsnVehicle + { + Hsn = match.Groups["hsn"].Value, + Tsn = match.Groups["tsn"].Value, + HsnTsn = $"{match.Groups["hsn"].Value}/{match.Groups["tsn"].Value}", + Brand = brand, + VehicleType = vehicleTypeRaw, + PowerPs = ps, + PowerKw = kw, + DisplacementCcm = displacementCcm, + FuelType = fuelType, + SourceListUrl = sourceListUrl, + SourceDetailUrl = detailUrl, + SourceQuery = sourceQuery ?? string.Empty + }; + + vehicle.MatchKey = BuildMatchKey(vehicle); + result.Add(vehicle); + } + + return result; + } + + private string ToAbsoluteUrl(string href) + { + if (string.IsNullOrWhiteSpace(href)) + { + return string.Empty; + } + + if (Uri.TryCreate(href, UriKind.Absolute, out var absolute) && + (absolute.Scheme.Equals(Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) || + absolute.Scheme.Equals(Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase))) + { + return absolute.ToString(); + } + + var normalized = href.StartsWith("/", StringComparison.Ordinal) ? href : $"/{href.TrimStart('/')}"; + return new Uri(_client.BaseAddress!, normalized).ToString(); + } + + private static HtmlDocument LoadDocument(string html) + { + var doc = new HtmlDocument(); + doc.LoadHtml(html); + return doc; + } + + private static string Clean(string value) + { + return HtmlEntity.DeEntitize(value).Replace('\u00A0', ' ').Trim(); + } + + private static void ParsePower(string powerText, out int? ps, out int? kw) + { + ps = null; + kw = null; + + var match = PsKwRegex.Match(powerText); + if (!match.Success) + { + return; + } + + if (int.TryParse(match.Groups["ps"].Value, out var psParsed)) + { + ps = psParsed; + } + + if (int.TryParse(match.Groups["kw"].Value, out var kwParsed)) + { + kw = kwParsed; + } + } + + private static int? ParseDisplacement(string text) + { + var match = CcmRegex.Match(text); + if (!match.Success) + { + return null; + } + + var numeric = match.Groups["ccm"].Value.Replace(".", string.Empty).Replace(",", string.Empty); + return int.TryParse(numeric, out var ccm) ? ccm : null; + } + + private static void ParseYearRange(string yearText, out int? fromYear, out int? toYear) + { + fromYear = null; + toYear = null; + + var match = YearRangeRegex.Match(yearText); + if (!match.Success) + { + return; + } + + if (int.TryParse(match.Groups["from"].Value, out var from)) + { + fromYear = from; + } + + if (int.TryParse(match.Groups["to"].Value, out var to)) + { + toYear = to; + } + } + + private static string ExtractBrand(string vehicleType) + { + if (string.IsNullOrWhiteSpace(vehicleType)) + { + return string.Empty; + } + + var parts = vehicleType.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); + return parts.Length == 0 ? string.Empty : parts[0]; + } + + private static bool IsLikelyBrandHeading(string heading) + { + if (string.IsNullOrWhiteSpace(heading)) + { + return false; + } + + return !Regex.IsMatch(heading, @"^\d{4}$"); + } + + private static string BuildMatchKey(HsnTsnVehicle vehicle) + { + var raw = $"{vehicle.Brand} {vehicle.VehicleType} {vehicle.OfficialType} {vehicle.PowerKw} {vehicle.FuelType}".Trim(); + var normalized = raw.ToUpperInvariant() + .Replace("Ä", "AE") + .Replace("Ö", "OE") + .Replace("Ü", "UE") + .Replace("ß", "SS"); + + var cleaned = Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim(); + return Regex.Replace(cleaned, @"\s+", " "); + } +} + +public sealed class VehicleDetail +{ + public string Hsn { get; set; } = string.Empty; + public string Tsn { get; set; } = string.Empty; + public string Brand { get; set; } = string.Empty; + public string Model { get; set; } = string.Empty; + public string OfficialType { get; set; } = string.Empty; + public int? YearFrom { get; set; } + public int? YearTo { get; set; } + public string CanonicalUrl { get; set; } = string.Empty; +} + +public sealed class HsnTsnVehicle +{ + public string HsnTsn { get; set; } = string.Empty; + public string Hsn { get; set; } = string.Empty; + public string Tsn { get; set; } = string.Empty; + public string Brand { get; set; } = string.Empty; + public string VehicleType { get; set; } = string.Empty; + public string Model { get; set; } = string.Empty; + public string OfficialType { get; set; } = string.Empty; + public int? YearFrom { get; set; } + public int? YearTo { get; set; } + public int? PowerPs { get; set; } + public int? PowerKw { get; set; } + public int? DisplacementCcm { get; set; } + public string FuelType { get; set; } = string.Empty; + public string MatchKey { get; set; } = string.Empty; + public string SourceQuery { get; set; } = string.Empty; + public string SourceListUrl { get; set; } = string.Empty; + public string SourceDetailUrl { get; set; } = string.Empty; +} diff --git a/src/HsnTsnScraper/HsnTsnScraper.csproj b/src/HsnTsnScraper/HsnTsnScraper.csproj new file mode 100644 index 0000000..1778750 --- /dev/null +++ b/src/HsnTsnScraper/HsnTsnScraper.csproj @@ -0,0 +1,17 @@ + + + + Exe + net9.0 + enable + enable + + + + + + + + + + diff --git a/src/HsnTsnScraper/Program.cs b/src/HsnTsnScraper/Program.cs new file mode 100644 index 0000000..3a62c42 --- /dev/null +++ b/src/HsnTsnScraper/Program.cs @@ -0,0 +1,163 @@ +using System.Globalization; +using CsvHelper; +using CsvHelper.Configuration; +using HsnTsnScraper; + +var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase); + +using var client = new HsnTsnClient(); +var map = new Dictionary(StringComparer.OrdinalIgnoreCase); + +if (Console.IsInputRedirected) +{ + await foreach (var query in ReadInput()) + { + var vehicles = await client.GetVehiclesFromSearchAsync(query); + Merge(map, vehicles); + } +} +else +{ + var brandUrls = await client.GetBrandPageUrls(); + foreach (var url in brandUrls) + { + var vehicles = await client.GetVehiclesFromBrandPageAsync(url); + Merge(map, vehicles); + } +} + +if (includeDetails) +{ + foreach (var vehicle in map.Values) + { + var detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl); + if (detail is null) + { + continue; + } + + if (!string.IsNullOrWhiteSpace(detail.Brand)) + { + vehicle.Brand = detail.Brand; + } + + vehicle.Model = detail.Model; + vehicle.OfficialType = detail.OfficialType; + vehicle.YearFrom = detail.YearFrom; + vehicle.YearTo = detail.YearTo; + + if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl)) + { + vehicle.SourceDetailUrl = detail.CanonicalUrl; + } + } +} + +foreach (var vehicle in map.Values) +{ + vehicle.MatchKey = BuildMatchKey(vehicle); +} + +await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture) +{ + Delimiter = ";" +}); + +csvWriter.WriteHeader(); +await csvWriter.NextRecordAsync(); + +foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase)) +{ + csvWriter.WriteRecord(vehicle); + await csvWriter.NextRecordAsync(); +} + +await csvWriter.FlushAsync(); + +return; + +void Merge(IDictionary mapByHsnTsn, IEnumerable vehicles) +{ + foreach (var vehicle in vehicles) + { + var key = vehicle.HsnTsn; + if (!mapByHsnTsn.TryGetValue(key, out var existing)) + { + mapByHsnTsn[key] = vehicle; + continue; + } + + if (string.IsNullOrWhiteSpace(existing.VehicleType) && !string.IsNullOrWhiteSpace(vehicle.VehicleType)) + { + existing.VehicleType = vehicle.VehicleType; + } + + if (existing.PowerPs is null && vehicle.PowerPs is not null) + { + existing.PowerPs = vehicle.PowerPs; + } + + if (existing.PowerKw is null && vehicle.PowerKw is not null) + { + existing.PowerKw = vehicle.PowerKw; + } + + if (existing.DisplacementCcm is null && vehicle.DisplacementCcm is not null) + { + existing.DisplacementCcm = vehicle.DisplacementCcm; + } + + if (string.IsNullOrWhiteSpace(existing.FuelType) && !string.IsNullOrWhiteSpace(vehicle.FuelType)) + { + existing.FuelType = vehicle.FuelType; + } + + if (string.IsNullOrWhiteSpace(existing.SourceQuery) && !string.IsNullOrWhiteSpace(vehicle.SourceQuery)) + { + existing.SourceQuery = vehicle.SourceQuery; + } + + if (string.IsNullOrWhiteSpace(existing.SourceListUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceListUrl)) + { + existing.SourceListUrl = vehicle.SourceListUrl; + } + + if (string.IsNullOrWhiteSpace(existing.SourceDetailUrl) && !string.IsNullOrWhiteSpace(vehicle.SourceDetailUrl)) + { + existing.SourceDetailUrl = vehicle.SourceDetailUrl; + } + } +} + +async IAsyncEnumerable ReadInput() +{ + var seen = new HashSet(StringComparer.OrdinalIgnoreCase); + while (await Console.In.ReadLineAsync() is { } line) + { + var value = line.Trim(); + if (string.IsNullOrWhiteSpace(value)) + { + continue; + } + + if (!seen.Add(value)) + { + continue; + } + + yield return value; + } +} + +string BuildMatchKey(HsnTsnVehicle vehicle) +{ + var raw = $"{vehicle.Brand} {vehicle.VehicleType} {vehicle.Model} {vehicle.OfficialType} {vehicle.PowerKw} {vehicle.DisplacementCcm} {vehicle.FuelType}"; + var normalized = raw.ToUpperInvariant() + .Replace("Ä", "AE") + .Replace("Ö", "OE") + .Replace("Ü", "UE") + .Replace("ß", "SS"); + + normalized = System.Text.RegularExpressions.Regex.Replace(normalized, @"[^A-Z0-9]+", " ").Trim(); + return System.Text.RegularExpressions.Regex.Replace(normalized, @"\s+", " "); +}