using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Extensions.Http;
using Polly;
using Polly.Extensions.Http;

namespace HsnTsnScraper;

/// <summary>
/// Downloads pages below a configurable base URL and extracts HSN/TSN vehicle
/// list rows and vehicle detail data from their HTML.
/// </summary>
public sealed class HsnTsnClient : IDisposable
{
    /// <summary>Four digits, a slash, then three upper-case alphanumerics (groups <c>hsn</c>, <c>tsn</c>).</summary>
    private static readonly Regex HsnTsnRegex =
        new(@"(?<hsn>\d{4})\s*/\s*(?<tsn>[A-Z0-9]{3})", RegexOptions.Compiled);

    /// <summary>Text of the form "150 PS (110 kW)" (groups <c>ps</c>, <c>kw</c>).</summary>
    private static readonly Regex PsKwRegex =
        new(@"(?<ps>\d+)\s*PS\s*\((?<kw>\d+)\s*kW\)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>A number (digits, dots, commas) followed by "ccm" (group <c>ccm</c>).</summary>
    private static readonly Regex CcmRegex =
        new(@"(?<ccm>[\d\.\,]+)\s*ccm", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>A four-digit number, optionally followed by non-digits and a second four-digit number (groups <c>from</c>, <c>to</c>).</summary>
    private static readonly Regex YearRangeRegex =
        new(@"(?<from>\d{4})(?:\D+(?<to>\d{4}))?", RegexOptions.Compiled);

    /// <summary>A heading consisting of exactly four digits.</summary>
    private static readonly Regex FourDigitHeadingRegex = new(@"^\d{4}$", RegexOptions.Compiled);

    /// <summary>Any run of characters other than A-Z and 0-9.</summary>
    private static readonly Regex NonAlphanumericRunRegex = new(@"[^A-Z0-9]+", RegexOptions.Compiled);

    /// <summary>Link targets that must never be reported as brand pages.</summary>
    private static readonly HashSet<string> ExcludedLinks = new(StringComparer.OrdinalIgnoreCase)
    {
        "/impressum.php",
        "/datenschutz.php",
        "/datenschutz"
    };

    private readonly HttpClient _client;

    /// <summary>
    /// Creates a client whose requests are resolved against <paramref name="baseUrl"/>.
    /// </summary>
    /// <param name="baseUrl">Absolute base address of the site to scrape.</param>
    public HsnTsnClient(string baseUrl = "http://www.hsn-tsn.de/")
    {
        // Retry transient HTTP failures and 429 responses up to three times,
        // waiting 2^attempt seconds (2 s, 4 s, 8 s) between attempts.
        var retryPolicy = HttpPolicyExtensions
            .HandleTransientHttpError()
            .OrResult(x => x.StatusCode == HttpStatusCode.TooManyRequests)
            .WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)));

        var socketHandler = new SocketsHttpHandler
        {
            PooledConnectionLifetime = TimeSpan.FromMinutes(5),
            UseCookies = false
        };

        var pollyHandler = new PolicyHttpMessageHandler(retryPolicy)
        {
            InnerHandler = socketHandler
        };

        _client = new HttpClient(pollyHandler)
        {
            BaseAddress = new Uri(baseUrl)
        };
        _client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; hsntsn-scraper/1.0)");
    }

    /// <summary>
    /// Loads the start page and collects every link whose target ends in ".html".
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>Distinct absolute URLs, sorted case-insensitively.</returns>
    public async Task<IReadOnlyList<string>> GetBrandPageUrls(CancellationToken cancellationToken = default)
    {
        var homeHtml = await GetStringAsync("/", cancellationToken).ConfigureAwait(false);
        var doc = LoadDocument(homeHtml);
        var links = doc.DocumentNode.SelectNodes("//a[@href]") ?? new HtmlNodeCollection(null);

        var result = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        foreach (var link in links)
        {
            var href = link.GetAttributeValue("href", string.Empty).Trim();
            if (string.IsNullOrWhiteSpace(href))
            {
                continue;
            }

            if (!href.EndsWith(".html", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // NOTE(review): none of the current ExcludedLinks entries end in ".html",
            // so this check cannot fire after the filter above; kept as a safety net
            // in case ".html" entries are added to the set.
            if (ExcludedLinks.Contains(href))
            {
                continue;
            }

            var isAbsoluteHttp =
                href.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                href.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            result.Add(isAbsoluteHttp ? href : new Uri(_client.BaseAddress!, href).ToString());
        }

        return result.OrderBy(x => x, StringComparer.OrdinalIgnoreCase).ToArray();
    }

    /// <summary>
    /// Requests <c>/liste.php?string=…</c> for <paramref name="query"/> and parses the resulting table.
    /// </summary>
    /// <param name="query">Search text; it is URL-escaped before being sent.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>The vehicles found on the result page; each carries <paramref name="query"/> as its source query.</returns>
    public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesFromSearchAsync(string query, CancellationToken cancellationToken = default)
    {
        var encoded = Uri.EscapeDataString(query);
        var url = $"/liste.php?string={encoded}";
        var html = await GetStringAsync(url, cancellationToken).ConfigureAwait(false);
        return ParseVehiclesFromListPage(html, new Uri(_client.BaseAddress!, url).ToString(), query);
    }

    /// <summary>
    /// Downloads a list page and parses its vehicle table.
    /// </summary>
    /// <param name="absoluteUrl">URL of the list page; also recorded as the source list URL of every vehicle.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>The vehicles found on the page, with an empty source query.</returns>
    public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesFromBrandPageAsync(string absoluteUrl, CancellationToken cancellationToken = default)
    {
        var html = await GetStringAsync(absoluteUrl, cancellationToken).ConfigureAwait(false);
        return ParseVehiclesFromListPage(html, absoluteUrl, null);
    }

    /// <summary>
    /// Downloads a vehicle detail page and extracts its structured data.
    /// </summary>
    /// <param name="detailUrl">URL of the detail page.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>
    /// The parsed detail, or <see langword="null"/> when <paramref name="detailUrl"/> is blank
    /// or the page contains no <c>div</c> with id <c>car</c>.
    /// </returns>
    public async Task<VehicleDetail?> GetVehicleDetailAsync(string detailUrl, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(detailUrl))
        {
            return null;
        }

        var html = await GetStringAsync(detailUrl, cancellationToken).ConfigureAwait(false);
        var doc = LoadDocument(html);

        var carNode = doc.DocumentNode.SelectSingleNode("//div[@id='car']");
        if (carNode is null)
        {
            return null;
        }

        var detail = new VehicleDetail
        {
            Hsn = carNode.GetAttributeValue("data-hsn", string.Empty).Trim(),
            Tsn = carNode.GetAttributeValue("data-tsn", string.Empty).Trim(),
            Brand = ReadNodeText(doc, "//span[@property='brand']//span[@property='name']"),
            Model = ReadNodeText(doc, "//span[@property='model']"),
            OfficialType = ReadNodeText(doc, "//span[@property='alternateName']"),
            CanonicalUrl = doc.DocumentNode
                .SelectSingleNode("//meta[@property='url']")
                ?.GetAttributeValue("content", string.Empty).Trim() ?? string.Empty
        };

        // Year candidates in priority order: dedicated element, page title, main heading.
        var yearTexts = new[]
        {
            ReadNodeText(doc, "//small[@property='vehicleModelDate']"),
            ReadNodeText(doc, "//title"),
            ReadNodeText(doc, "//h1")
        };

        ParseBestYearRange(yearTexts, out var fromYear, out var toYear);
        detail.YearFrom = fromYear;
        detail.YearTo = toYear;
        return detail;
    }

    /// <summary>Disposes the underlying <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _client.Dispose();
    }

    /// <summary>
    /// Issues a GET request and returns the response body as text.
    /// </summary>
    /// <exception cref="HttpRequestException">The response status code does not indicate success.</exception>
    private async Task<string> GetStringAsync(string relativeOrAbsoluteUrl, CancellationToken cancellationToken)
    {
        using var response = await _client.GetAsync(relativeOrAbsoluteUrl, cancellationToken).ConfigureAwait(false);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Converts every table row with at least five cells and a recognisable HSN/TSN
    /// in its first cell into an <see cref="HsnTsnVehicle"/>.
    /// </summary>
    /// <param name="html">Markup of the list page.</param>
    /// <param name="sourceListUrl">URL recorded on every vehicle as its list source.</param>
    /// <param name="sourceQuery">Search text recorded on every vehicle; <see langword="null"/> is stored as an empty string.</param>
    private IReadOnlyList<HsnTsnVehicle> ParseVehiclesFromListPage(string html, string sourceListUrl, string? sourceQuery)
    {
        var doc = LoadDocument(html);
        var rows = doc.DocumentNode.SelectNodes("//table//tr") ?? new HtmlNodeCollection(null);

        // The page heading is used as the brand unless it is blank or a bare four-digit number.
        var pageHeading = Clean(doc.DocumentNode.SelectSingleNode("//h1")?.InnerText ?? string.Empty);
        var pageBrand = IsLikelyBrandHeading(pageHeading) ? pageHeading : string.Empty;

        var result = new List<HsnTsnVehicle>();
        foreach (var row in rows)
        {
            var cells = row.SelectNodes("./td");
            if (cells is null || cells.Count < 5)
            {
                continue;
            }

            var match = HsnTsnRegex.Match(Clean(cells[0].InnerText));
            if (!match.Success)
            {
                continue;
            }

            var hsn = match.Groups["hsn"].Value;
            var tsn = match.Groups["tsn"].Value;

            var vehicleTypeRaw = Clean(cells[1].InnerText);
            var detailHref = cells[1].SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty;

            ParsePower(Clean(cells[2].InnerText), out var ps, out var kw);
            var displacementCcm = ParseDisplacement(Clean(cells[3].InnerText));
            var fuelType = Clean(cells[4].InnerText);

            var brand = !string.IsNullOrWhiteSpace(pageBrand) ? pageBrand : ExtractBrand(vehicleTypeRaw);

            var vehicle = new HsnTsnVehicle
            {
                Hsn = hsn,
                Tsn = tsn,
                HsnTsn = $"{hsn}/{tsn}",
                Brand = brand,
                VehicleType = vehicleTypeRaw,
                PowerPs = ps,
                PowerKw = kw,
                DisplacementCcm = displacementCcm,
                FuelType = fuelType,
                SourceListUrl = sourceListUrl,
                SourceDetailUrl = ToAbsoluteUrl(detailHref),
                SourceQuery = sourceQuery ?? string.Empty
            };

            // The key is derived from the populated vehicle, so it is built last.
            vehicle.MatchKey = BuildMatchKey(vehicle);
            result.Add(vehicle);
        }

        return result;
    }

    /// <summary>
    /// Resolves <paramref name="href"/> to an absolute URL string.
    /// </summary>
    /// <returns>
    /// An empty string for blank input; the URL itself when it is already absolute http(s);
    /// otherwise the href resolved as a root-relative path against the client's base address.
    /// </returns>
    private string ToAbsoluteUrl(string href)
    {
        if (string.IsNullOrWhiteSpace(href))
        {
            return string.Empty;
        }

        // The scheme check matters: Uri.TryCreate may accept other absolute forms
        // (for example file paths), which must still be resolved against the base address.
        if (Uri.TryCreate(href, UriKind.Absolute, out var absolute) &&
            (absolute.Scheme.Equals(Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) ||
             absolute.Scheme.Equals(Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase)))
        {
            return absolute.ToString();
        }

        // Hrefs without a leading slash are deliberately treated as root-relative.
        var normalized = href.StartsWith("/", StringComparison.Ordinal) ? href : $"/{href}";
        return new Uri(_client.BaseAddress!, normalized).ToString();
    }

    /// <summary>Parses <paramref name="html"/> into a new <see cref="HtmlDocument"/>.</summary>
    private static HtmlDocument LoadDocument(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }

    /// <summary>
    /// Returns the de-entitized, trimmed inner text of the first node matching
    /// <paramref name="xpath"/>, or an empty string when no node matches.
    /// </summary>
    private static string ReadNodeText(HtmlDocument doc, string xpath)
    {
        var rawText = doc.DocumentNode.SelectSingleNode(xpath)?.InnerText ?? string.Empty;
        return HtmlEntity.DeEntitize(rawText).Trim();
    }

    /// <summary>Decodes HTML entities, turns non-breaking spaces into plain spaces and trims the result.</summary>
    private static string Clean(string value)
    {
        return HtmlEntity.DeEntitize(value).Replace('\u00A0', ' ').Trim();
    }

    /// <summary>
    /// Parses a culture-independent, digits-only integer. The regex captures passed in
    /// contain nothing but digits, so no sign, whitespace or separators are allowed.
    /// </summary>
    private static bool TryParseDigits(string text, out int value)
    {
        return int.TryParse(text, NumberStyles.None, CultureInfo.InvariantCulture, out value);
    }

    /// <summary>
    /// Extracts PS and kW values from text such as "150 PS (110 kW)".
    /// Both outputs stay <see langword="null"/> when the text does not match.
    /// </summary>
    private static void ParsePower(string powerText, out int? ps, out int? kw)
    {
        ps = null;
        kw = null;

        var match = PsKwRegex.Match(powerText);
        if (!match.Success)
        {
            return;
        }

        if (TryParseDigits(match.Groups["ps"].Value, out var psParsed))
        {
            ps = psParsed;
        }

        if (TryParseDigits(match.Groups["kw"].Value, out var kwParsed))
        {
            kw = kwParsed;
        }
    }

    /// <summary>
    /// Extracts the number in front of "ccm", ignoring dots and commas inside it
    /// (so "1.998 ccm" yields 1998). Returns <see langword="null"/> when nothing parses.
    /// </summary>
    private static int? ParseDisplacement(string text)
    {
        var match = CcmRegex.Match(text);
        if (!match.Success)
        {
            return null;
        }

        var numeric = match.Groups["ccm"].Value.Replace(".", string.Empty).Replace(",", string.Empty);
        return TryParseDigits(numeric, out var ccm) ? ccm : null;
    }

    /// <summary>
    /// Reads the first four-digit number in <paramref name="yearText"/> as the start year and,
    /// if another four-digit number follows it, that number as the end year.
    /// </summary>
    private static void ParseYearRange(string yearText, out int? fromYear, out int? toYear)
    {
        fromYear = null;
        toYear = null;

        var match = YearRangeRegex.Match(yearText);
        if (!match.Success)
        {
            return;
        }

        if (TryParseDigits(match.Groups["from"].Value, out var from))
        {
            fromYear = from;
        }

        // The "to" group is optional; its value is empty (and fails to parse) when absent.
        if (TryParseDigits(match.Groups["to"].Value, out var to))
        {
            toYear = to;
        }
    }

    /// <summary>
    /// Picks a year range from <paramref name="candidates"/>: the first candidate with a start
    /// year wins, unless it lacks an end year and a later candidate provides both.
    /// </summary>
    private static void ParseBestYearRange(IEnumerable<string> candidates, out int? fromYear, out int? toYear)
    {
        fromYear = null;
        toYear = null;

        foreach (var text in candidates)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                continue;
            }

            ParseYearRange(text, out var currentFrom, out var currentTo);
            if (currentFrom is null)
            {
                continue;
            }

            if (fromYear is null)
            {
                fromYear = currentFrom;
                toYear = currentTo;
                continue;
            }

            // Prefer a range that includes both endpoints over a single year.
            if (toYear is null && currentTo is not null)
            {
                fromYear = currentFrom;
                toYear = currentTo;
            }
        }
    }

    /// <summary>Returns the first space-separated word of <paramref name="vehicleType"/>, or an empty string.</summary>
    private static string ExtractBrand(string vehicleType)
    {
        if (string.IsNullOrWhiteSpace(vehicleType))
        {
            return string.Empty;
        }

        var parts = vehicleType.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        return parts.Length == 0 ? string.Empty : parts[0];
    }

    /// <summary>Returns <see langword="true"/> unless the heading is blank or consists of exactly four digits.</summary>
    private static bool IsLikelyBrandHeading(string heading)
    {
        if (string.IsNullOrWhiteSpace(heading))
        {
            return false;
        }

        return !FourDigitHeadingRegex.IsMatch(heading);
    }

    /// <summary>
    /// Builds a normalised key from brand, vehicle type, official type, kW and fuel type:
    /// upper-cased, German umlauts and ß transliterated, and every run of other
    /// characters collapsed to a single space.
    /// </summary>
    private static string BuildMatchKey(HsnTsnVehicle vehicle)
    {
        var raw = $"{vehicle.Brand} {vehicle.VehicleType} {vehicle.OfficialType} {vehicle.PowerKw} {vehicle.FuelType}".Trim();

        var normalized = raw.ToUpperInvariant()
            .Replace("Ä", "AE")
            .Replace("Ö", "OE")
            .Replace("Ü", "UE")
            .Replace("ß", "SS");

        // Each run of non-alphanumerics (whitespace included) becomes one space, so no
        // further whitespace collapsing is needed after trimming.
        return NonAlphanumericRunRegex.Replace(normalized, " ").Trim();
    }
}

/// <summary>Data extracted from a single vehicle detail page.</summary>
public sealed class VehicleDetail
{
    /// <summary>Value of the <c>data-hsn</c> attribute of the page's car element.</summary>
    public string Hsn { get; set; } = string.Empty;

    /// <summary>Value of the <c>data-tsn</c> attribute of the page's car element.</summary>
    public string Tsn { get; set; } = string.Empty;

    /// <summary>Brand name as shown on the page.</summary>
    public string Brand { get; set; } = string.Empty;

    /// <summary>Model name as shown on the page.</summary>
    public string Model { get; set; } = string.Empty;

    /// <summary>Text of the page's <c>alternateName</c> element.</summary>
    public string OfficialType { get; set; } = string.Empty;

    /// <summary>First year found on the page, if any.</summary>
    public int? YearFrom { get; set; }

    /// <summary>Second year of the range found on the page, if any.</summary>
    public int? YearTo { get; set; }

    /// <summary>Content of the page's <c>url</c> meta element.</summary>
    public string CanonicalUrl { get; set; } = string.Empty;
}

/// <summary>One vehicle row parsed from a list or search result page.</summary>
public sealed class HsnTsnVehicle
{
    /// <summary>Combined code in the form "HSN/TSN".</summary>
    public string HsnTsn { get; set; } = string.Empty;

    /// <summary>Four-digit part of the code.</summary>
    public string Hsn { get; set; } = string.Empty;

    /// <summary>Three-character part of the code.</summary>
    public string Tsn { get; set; } = string.Empty;

    /// <summary>Page heading when usable, otherwise the first word of <see cref="VehicleType"/>.</summary>
    public string Brand { get; set; } = string.Empty;

    /// <summary>Cleaned text of the row's second cell.</summary>
    public string VehicleType { get; set; } = string.Empty;

    /// <summary>Model name; not populated by list parsing.</summary>
    public string Model { get; set; } = string.Empty;

    /// <summary>Official type designation; not populated by list parsing.</summary>
    public string OfficialType { get; set; } = string.Empty;

    /// <summary>Start year; not populated by list parsing.</summary>
    public int? YearFrom { get; set; }

    /// <summary>End year; not populated by list parsing.</summary>
    public int? YearTo { get; set; }

    /// <summary>Power in PS, when the row's power cell could be parsed.</summary>
    public int? PowerPs { get; set; }

    /// <summary>Power in kW, when the row's power cell could be parsed.</summary>
    public int? PowerKw { get; set; }

    /// <summary>Displacement in ccm, when the row's displacement cell could be parsed.</summary>
    public int? DisplacementCcm { get; set; }

    /// <summary>Cleaned text of the row's fifth cell.</summary>
    public string FuelType { get; set; } = string.Empty;

    /// <summary>Normalised key built from brand, type, kW and fuel type.</summary>
    public string MatchKey { get; set; } = string.Empty;

    /// <summary>Search text that produced this row, or empty for brand pages.</summary>
    public string SourceQuery { get; set; } = string.Empty;

    /// <summary>URL of the list page this row was parsed from.</summary>
    public string SourceListUrl { get; set; } = string.Empty;

    /// <summary>Absolute URL of the row's detail link, or empty when the row has none.</summary>
    public string SourceDetailUrl { get; set; } = string.Empty;
}