// Source: hsntsn-scraper/src/HsnTsnScraper/HsnTsnClient.cs (C#, 402 lines, 14 KiB)

using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using Microsoft.Extensions.Http;
using Polly;
using Polly.Extensions.Http;
namespace HsnTsnScraper;
public sealed class HsnTsnClient : IDisposable
{
/// <summary>
/// Matches an "HSN/TSN" pair: four digits, a slash optionally surrounded by whitespace,
/// then three characters from A-Z or 0-9. The pattern is case-sensitive and unanchored.
/// </summary>
private static readonly Regex HsnTsnRegex = new(@"(?<hsn>\d{4})\s*/\s*(?<tsn>[A-Z0-9]{3})", RegexOptions.Compiled);
/// <summary>
/// Matches power text of the form "&lt;digits&gt; PS (&lt;digits&gt; kW)", ignoring case.
/// </summary>
private static readonly Regex PsKwRegex = new(@"(?<ps>\d+)\s*PS\s*\((?<kw>\d+)\s*kW\)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches a run of digits, dots and commas followed by "ccm", ignoring case.
/// </summary>
private static readonly Regex CcmRegex = new(@"(?<ccm>[\d\.\,]+)\s*ccm", RegexOptions.Compiled | RegexOptions.IgnoreCase);
/// <summary>
/// Matches four digits, optionally followed by non-digits and another four digits.
/// The pattern itself applies no word-boundary or value-range check.
/// </summary>
private static readonly Regex YearRangeRegex = new(@"(?<from>\d{4})(?:\D+(?<to>\d{4}))?", RegexOptions.Compiled);
/// <summary>
/// Href values that <c>GetBrandPageUrls</c> skips; compared case-insensitively.
/// NOTE(review): that method only consults this set for hrefs ending in ".html",
/// which none of the entries below do - confirm the entries / check order are intended.
/// </summary>
private static readonly HashSet<string> ExcludedLinks = new(StringComparer.OrdinalIgnoreCase)
{
"/impressum.php",
"/datenschutz.php",
"/datenschutz"
};
/// <summary>HTTP client used for every request; created in the constructor and disposed by <c>Dispose</c>.</summary>
private readonly HttpClient _client;
/// <summary>
/// Creates a client whose requests are sent through a Polly retry handler layered on a
/// cookie-less <see cref="SocketsHttpHandler"/>.
/// </summary>
/// <param name="baseUrl">Absolute base URL; it becomes the <see cref="HttpClient.BaseAddress"/> of the client.</param>
public HsnTsnClient(string baseUrl = "http://www.hsn-tsn.de/")
{
    // Innermost transport: pooled connections are recycled after five minutes, cookies are off.
    var transport = new SocketsHttpHandler
    {
        UseCookies = false,
        PooledConnectionLifetime = TimeSpan.FromMinutes(5)
    };

    // Up to three retries on transient HTTP errors or HTTP 429, waiting 2^attempt seconds each time.
    var backoffPolicy = HttpPolicyExtensions
        .HandleTransientHttpError()
        .OrResult(response => response.StatusCode == HttpStatusCode.TooManyRequests)
        .WaitAndRetryAsync(3, attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt)));

    var retryingHandler = new PolicyHttpMessageHandler(backoffPolicy) { InnerHandler = transport };

    _client = new HttpClient(retryingHandler) { BaseAddress = new Uri(baseUrl) };
    _client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; hsntsn-scraper/1.0)");
}
/// <summary>
/// Downloads the home page ("/") and collects the targets of all links whose href ends in ".html".
/// </summary>
/// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
/// <returns>
/// Distinct absolute URLs (case-insensitive), sorted case-insensitively. Empty when the page has no links.
/// </returns>
/// <exception cref="HttpRequestException">The home page request did not succeed.</exception>
public async Task<IReadOnlyList<string>> GetBrandPageUrls(CancellationToken cancellationToken = default)
{
    var homeHtml = await GetStringAsync("/", cancellationToken);
    var anchors = LoadDocument(homeHtml).DocumentNode.SelectNodes("//a[@href]");
    if (anchors is null)
    {
        return Array.Empty<string>();
    }

    var result = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    foreach (var anchor in anchors)
    {
        // HtmlAgilityPack returns the attribute text as written in the markup, so entities
        // such as "&amp;" must be decoded before the value is used as a URL.
        var href = HtmlEntity.DeEntitize(anchor.GetAttributeValue("href", string.Empty)).Trim();

        if (string.IsNullOrWhiteSpace(href) ||
            !href.EndsWith(".html", StringComparison.OrdinalIgnoreCase) ||
            ExcludedLinks.Contains(href))
        {
            continue;
        }

        if (href.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            href.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            result.Add(href);
            continue;
        }

        // TryCreate instead of the Uri constructor: one malformed link must not abort the whole scan.
        // Anything that does not resolve to http(s) (e.g. "mailto:x.html") is not a fetchable page.
        if (Uri.TryCreate(_client.BaseAddress, href, out var resolved) &&
            (resolved.Scheme == Uri.UriSchemeHttp || resolved.Scheme == Uri.UriSchemeHttps))
        {
            result.Add(resolved.ToString());
        }
    }

    return result.OrderBy(url => url, StringComparer.OrdinalIgnoreCase).ToArray();
}
/// <summary>
/// Requests "/liste.php?string=&lt;query&gt;" (query URL-escaped) and parses the vehicle table of the response.
/// </summary>
/// <param name="query">Search text; it is also stored as <c>SourceQuery</c> on every returned vehicle.</param>
/// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
/// <returns>The vehicles parsed from the result page.</returns>
public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesFromSearchAsync(string query, CancellationToken cancellationToken = default)
{
    var relativeUrl = $"/liste.php?string={Uri.EscapeDataString(query)}";
    var page = await GetStringAsync(relativeUrl, cancellationToken);

    // The absolute form of the request URL is recorded as the list source on each vehicle.
    var listUrl = new Uri(_client.BaseAddress!, relativeUrl).ToString();
    return ParseVehiclesFromListPage(page, listUrl, query);
}
/// <summary>
/// Downloads the page at <paramref name="absoluteUrl"/> and parses its vehicle table.
/// </summary>
/// <param name="absoluteUrl">URL of the page; it is also recorded as the list source on every vehicle.</param>
/// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
/// <returns>The vehicles parsed from the page; their <c>SourceQuery</c> is empty.</returns>
public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesFromBrandPageAsync(string absoluteUrl, CancellationToken cancellationToken = default)
    => ParseVehiclesFromListPage(
        await GetStringAsync(absoluteUrl, cancellationToken),
        absoluteUrl,
        sourceQuery: null);
/// <summary>
/// Downloads a vehicle detail page and reads its fields from the <c>div#car</c> element and
/// the <c>property</c>-annotated elements of the page.
/// </summary>
/// <param name="detailUrl">URL of the detail page.</param>
/// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
/// <returns>
/// The parsed detail, or <c>null</c> when <paramref name="detailUrl"/> is blank or the page has no <c>div#car</c>.
/// </returns>
public async Task<VehicleDetail?> GetVehicleDetailAsync(string detailUrl, CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(detailUrl))
    {
        return null;
    }

    var root = LoadDocument(await GetStringAsync(detailUrl, cancellationToken)).DocumentNode;

    var car = root.SelectSingleNode("//div[@id='car']");
    if (car is null)
    {
        return null;
    }

    // Entity-decoded, trimmed inner text of the first node matching the XPath; "" when nothing matches.
    string TextOf(string xpath) =>
        HtmlEntity.DeEntitize(root.SelectSingleNode(xpath)?.InnerText ?? string.Empty).Trim();

    // Year candidates in priority order: dedicated model-date element, page title, main heading.
    var yearCandidates = new[]
    {
        TextOf("//small[@property='vehicleModelDate']"),
        TextOf("//title"),
        TextOf("//h1")
    };
    ParseBestYearRange(yearCandidates, out var yearFrom, out var yearTo);

    return new VehicleDetail
    {
        Hsn = car.GetAttributeValue("data-hsn", string.Empty).Trim(),
        Tsn = car.GetAttributeValue("data-tsn", string.Empty).Trim(),
        Brand = TextOf("//span[@property='brand']//span[@property='name']"),
        Model = TextOf("//span[@property='model']"),
        OfficialType = TextOf("//span[@property='alternateName']"),
        CanonicalUrl = root.SelectSingleNode("//meta[@property='url']")?.GetAttributeValue("content", string.Empty).Trim() ?? string.Empty,
        YearFrom = yearFrom,
        YearTo = yearTo
    };
}
/// <summary>Disposes the underlying <see cref="HttpClient"/>.</summary>
public void Dispose() => _client.Dispose();
/// <summary>
/// Issues a GET request and returns the response body as text.
/// </summary>
/// <param name="relativeOrAbsoluteUrl">Request URL, passed unchanged to <see cref="HttpClient.GetAsync(string, CancellationToken)"/>.</param>
/// <param name="cancellationToken">Token used to cancel the request and the body read.</param>
/// <exception cref="HttpRequestException">The response status code does not indicate success.</exception>
private async Task<string> GetStringAsync(string relativeOrAbsoluteUrl, CancellationToken cancellationToken)
{
    using (var reply = await _client.GetAsync(relativeOrAbsoluteUrl, cancellationToken))
    {
        reply.EnsureSuccessStatusCode();
        var body = await reply.Content.ReadAsStringAsync(cancellationToken);
        return body;
    }
}
/// <summary>
/// Extracts one <see cref="HsnTsnVehicle"/> per table row of a list page.
/// </summary>
/// <remarks>
/// A row is used only when it has at least five <c>td</c> cells and its first cell contains an
/// HSN/TSN pair. Cells are read in the order HSN/TSN, vehicle type (with detail link), power,
/// displacement, fuel type.
/// </remarks>
/// <param name="html">Markup of the list page.</param>
/// <param name="sourceListUrl">URL recorded as <c>SourceListUrl</c> on each vehicle.</param>
/// <param name="sourceQuery">Search query recorded as <c>SourceQuery</c>; <c>null</c> is stored as an empty string.</param>
/// <returns>The parsed vehicles in document order; empty when the page has no table rows.</returns>
private IReadOnlyList<HsnTsnVehicle> ParseVehiclesFromListPage(string html, string sourceListUrl, string? sourceQuery)
{
    var doc = LoadDocument(html);
    var result = new List<HsnTsnVehicle>();

    var rows = doc.DocumentNode.SelectNodes("//table//tr");
    if (rows is null)
    {
        return result;
    }

    // The page heading is used as the brand for every row unless it is just a four-digit number.
    var pageHeading = Clean(doc.DocumentNode.SelectSingleNode("//h1")?.InnerText ?? string.Empty);
    var pageBrand = IsLikelyBrandHeading(pageHeading) ? pageHeading : string.Empty;

    foreach (var row in rows)
    {
        var cells = row.SelectNodes("./td");
        if (cells is null || cells.Count < 5)
        {
            continue;
        }

        var match = HsnTsnRegex.Match(Clean(cells[0].InnerText));
        if (!match.Success)
        {
            continue;
        }

        var hsn = match.Groups["hsn"].Value;
        var tsn = match.Groups["tsn"].Value;
        var vehicleTypeRaw = Clean(cells[1].InnerText);

        // HtmlAgilityPack hands back the attribute text as written in the markup, so entities
        // ("&amp;" in query strings) have to be decoded, and stray whitespace removed, before
        // the value is turned into a URL.
        var rawHref = cells[1].SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty;
        var detailUrl = ToAbsoluteUrl(HtmlEntity.DeEntitize(rawHref).Trim());

        ParsePower(Clean(cells[2].InnerText), out var ps, out var kw);
        var displacementCcm = ParseDisplacement(Clean(cells[3].InnerText));
        var fuelType = Clean(cells[4].InnerText);

        // Fall back to the first word of the vehicle type when the page gives no usable brand heading.
        var brand = !string.IsNullOrWhiteSpace(pageBrand) ? pageBrand : ExtractBrand(vehicleTypeRaw);

        var vehicle = new HsnTsnVehicle
        {
            Hsn = hsn,
            Tsn = tsn,
            HsnTsn = $"{hsn}/{tsn}",
            Brand = brand,
            VehicleType = vehicleTypeRaw,
            PowerPs = ps,
            PowerKw = kw,
            DisplacementCcm = displacementCcm,
            FuelType = fuelType,
            SourceListUrl = sourceListUrl,
            SourceDetailUrl = detailUrl,
            SourceQuery = sourceQuery ?? string.Empty
        };

        // The key is derived from the fields set above, so it must be built last.
        vehicle.MatchKey = BuildMatchKey(vehicle);
        result.Add(vehicle);
    }

    return result;
}
/// <summary>
/// Converts a link target into an absolute http(s) URL on the configured base address.
/// </summary>
/// <param name="href">Link target; may be absolute, root-relative or a bare relative path.</param>
/// <returns>
/// The absolute URL, or an empty string when <paramref name="href"/> is blank, uses a
/// non-http(s) scheme (e.g. <c>javascript:</c>, <c>mailto:</c>) or cannot be resolved.
/// </returns>
private string ToAbsoluteUrl(string href)
{
    if (string.IsNullOrWhiteSpace(href))
    {
        return string.Empty;
    }

    var target = href.Trim();
    var isRooted = target.StartsWith('/');

    // Rooted paths are never treated as absolute here: on Unix "/foo" parses as a file:// URI.
    if (!isRooted && Uri.TryCreate(target, UriKind.Absolute, out var absolute))
    {
        var isHttp = absolute.Scheme.Equals(Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) ||
                     absolute.Scheme.Equals(Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);

        // Other schemes cannot be fetched; prefixing them with "/" would only fabricate a bogus site URL.
        return isHttp ? absolute.ToString() : string.Empty;
    }

    // Bare relative paths are anchored at the site root.
    var rootedPath = isRooted ? target : "/" + target;

    // TryCreate instead of the Uri constructor so a malformed href yields "" rather than an exception.
    return Uri.TryCreate(_client.BaseAddress, rootedPath, out var resolved)
        ? resolved.ToString()
        : string.Empty;
}
/// <summary>Parses <paramref name="html"/> into a new <see cref="HtmlDocument"/>.</summary>
/// <param name="html">Markup to parse.</param>
/// <returns>The freshly loaded document.</returns>
private static HtmlDocument LoadDocument(string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    return parsed;
}
/// <summary>
/// Decodes HTML entities, turns non-breaking spaces (U+00A0) into ordinary spaces and trims the result.
/// </summary>
/// <param name="value">Raw text taken from the document.</param>
private static string Clean(string value)
{
    var decoded = HtmlEntity.DeEntitize(value);
    var withPlainSpaces = decoded.Replace('\u00A0', ' ');
    return withPlainSpaces.Trim();
}
/// <summary>
/// Reads the PS and kW figures from text shaped like "75 PS (55 kW)".
/// </summary>
/// <param name="powerText">Text of the power cell.</param>
/// <param name="ps">The PS value, or <c>null</c> when the text does not match or the number cannot be parsed.</param>
/// <param name="kw">The kW value, or <c>null</c> when the text does not match or the number cannot be parsed.</param>
private static void ParsePower(string powerText, out int? ps, out int? kw)
{
    var hit = PsKwRegex.Match(powerText);
    if (hit.Success)
    {
        ps = int.TryParse(hit.Groups["ps"].Value, out var horsepower) ? horsepower : null;
        kw = int.TryParse(hit.Groups["kw"].Value, out var kilowatts) ? kilowatts : null;
    }
    else
    {
        ps = null;
        kw = null;
    }
}
/// <summary>
/// Reads the number that precedes "ccm" in <paramref name="text"/>, with every '.' and ','
/// removed before parsing (so "1.390 ccm" yields 1390).
/// </summary>
/// <param name="text">Text of the displacement cell.</param>
/// <returns>The parsed value, or <c>null</c> when the text does not match or the digits do not parse as an <c>int</c>.</returns>
private static int? ParseDisplacement(string text)
{
    var hit = CcmRegex.Match(text);
    if (hit.Success)
    {
        var digitsOnly = hit.Groups["ccm"].Value
            .Replace(".", string.Empty)
            .Replace(",", string.Empty);

        if (int.TryParse(digitsOnly, out var value))
        {
            return value;
        }
    }

    return null;
}
/// <summary>
/// Finds the first plausible year, and optionally a closing year, in free text.
/// </summary>
/// <remarks>
/// A four-digit token only counts as a year when it is not part of a longer digit run and lies
/// between 1900 and next calendar year. This keeps other four-digit numbers that occur in page
/// titles and headings (HSN codes such as "0603", slices of longer numbers) from being reported
/// as years. When the first candidate is rejected, the search resumes right after it, so a
/// later real year is still found.
/// </remarks>
/// <param name="yearText">Text to scan; <c>null</c> or empty yields no years.</param>
/// <param name="fromYear">First plausible year, or <c>null</c> when none is found.</param>
/// <param name="toYear">
/// The four-digit number directly following <paramref name="fromYear"/> (separated by non-digits only),
/// when it is plausible and not earlier than <paramref name="fromYear"/>; otherwise <c>null</c>.
/// </param>
private static void ParseYearRange(string yearText, out int? fromYear, out int? toYear)
{
    const int MinPlausibleYear = 1900;

    fromYear = null;
    toYear = null;
    if (string.IsNullOrEmpty(yearText))
    {
        return;
    }

    // One year of headroom for model years announced ahead of the calendar.
    var maxPlausibleYear = DateTime.UtcNow.Year + 1;

    // True when the group is a free-standing four-digit token whose value is a plausible year.
    bool TryReadYear(Group group, out int year)
    {
        year = 0;
        var start = group.Index;
        var end = start + group.Length;
        var standsAlone =
            (start == 0 || !char.IsDigit(yearText[start - 1])) &&
            (end == yearText.Length || !char.IsDigit(yearText[end]));

        return standsAlone &&
               int.TryParse(
                   group.Value,
                   System.Globalization.NumberStyles.None,
                   System.Globalization.CultureInfo.InvariantCulture,
                   out year) &&
               year >= MinPlausibleYear &&
               year <= maxPlausibleYear;
    }

    // After a rejected candidate, resume immediately behind its "from" digits so that the
    // number it had captured as "to" gets its own chance to be the start year.
    for (var match = YearRangeRegex.Match(yearText);
         match.Success;
         match = YearRangeRegex.Match(yearText, match.Groups["from"].Index + match.Groups["from"].Length))
    {
        if (!TryReadYear(match.Groups["from"], out var from))
        {
            continue;
        }

        fromYear = from;

        var toGroup = match.Groups["to"];
        if (toGroup.Success && TryReadYear(toGroup, out var to) && to >= from)
        {
            toYear = to;
        }

        return;
    }
}
/// <summary>
/// Picks a year range from several candidate texts, examined in order.
/// </summary>
/// <remarks>
/// The first candidate that yields a start year is taken. It is replaced by a later candidate
/// only while the current choice has no end year and the later candidate has both years.
/// </remarks>
/// <param name="candidates">Texts to examine; blank entries are ignored.</param>
/// <param name="fromYear">Chosen start year, or <c>null</c> when no candidate contains one.</param>
/// <param name="toYear">Chosen end year, or <c>null</c>.</param>
private static void ParseBestYearRange(IEnumerable<string> candidates, out int? fromYear, out int? toYear)
{
    fromYear = null;
    toYear = null;

    foreach (var candidate in candidates.Where(text => !string.IsNullOrWhiteSpace(text)))
    {
        ParseYearRange(candidate, out var parsedFrom, out var parsedTo);
        if (parsedFrom is null)
        {
            continue;
        }

        var nothingChosenYet = fromYear is null;

        // A complete range beats a previously chosen single year.
        var completesOpenRange = toYear is null && parsedTo is not null;

        if (nothingChosenYet || completesOpenRange)
        {
            fromYear = parsedFrom;
            toYear = parsedTo;
        }
    }
}
/// <summary>
/// Returns the first space-separated word of <paramref name="vehicleType"/>.
/// </summary>
/// <param name="vehicleType">Vehicle type text.</param>
/// <returns>The first word, or an empty string when the input is blank.</returns>
private static string ExtractBrand(string vehicleType)
{
    if (string.IsNullOrWhiteSpace(vehicleType))
    {
        return string.Empty;
    }

    var words = vehicleType.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    return words.FirstOrDefault() ?? string.Empty;
}
/// <summary>
/// Decides whether a page heading may be used as a brand name.
/// </summary>
/// <param name="heading">Heading text.</param>
/// <returns><c>false</c> for blank headings and for headings consisting of exactly four digits; otherwise <c>true</c>.</returns>
private static bool IsLikelyBrandHeading(string heading)
    => !string.IsNullOrWhiteSpace(heading) && !Regex.IsMatch(heading, @"^\d{4}$");
/// <summary>
/// Builds a normalized key from brand, vehicle type, official type, kW and fuel type.
/// </summary>
/// <remarks>
/// The text is upper-cased (invariant), German umlauts and sharp s are transliterated, and every
/// run of characters outside A-Z / 0-9 is collapsed into a single space.
/// </remarks>
/// <param name="vehicle">Vehicle whose fields feed the key.</param>
/// <returns>The key, without leading or trailing spaces.</returns>
private static string BuildMatchKey(HsnTsnVehicle vehicle)
{
    var key = $"{vehicle.Brand} {vehicle.VehicleType} {vehicle.OfficialType} {vehicle.PowerKw} {vehicle.FuelType}"
        .Trim()
        .ToUpperInvariant();

    // Applied in this order; the sharp s is still present after upper-casing.
    var transliterations = new[]
    {
        ("Ä", "AE"),
        ("Ö", "OE"),
        ("Ü", "UE"),
        ("ß", "SS")
    };
    foreach (var (special, ascii) in transliterations)
    {
        key = key.Replace(special, ascii);
    }

    key = Regex.Replace(key, @"[^A-Z0-9]+", " ").Trim();
    return Regex.Replace(key, @"\s+", " ");
}
}
/// <summary>
/// Data read from a vehicle detail page by <c>HsnTsnClient.GetVehicleDetailAsync</c>.
/// </summary>
public sealed class VehicleDetail
{
/// <summary>Trimmed value of the <c>data-hsn</c> attribute of the page's <c>div#car</c> element.</summary>
public string Hsn { get; set; } = string.Empty;
/// <summary>Trimmed value of the <c>data-tsn</c> attribute of the page's <c>div#car</c> element.</summary>
public string Tsn { get; set; } = string.Empty;
/// <summary>Text of the <c>property='name'</c> span inside the <c>property='brand'</c> span.</summary>
public string Brand { get; set; } = string.Empty;
/// <summary>Text of the <c>property='model'</c> span.</summary>
public string Model { get; set; } = string.Empty;
/// <summary>Text of the <c>property='alternateName'</c> span.</summary>
public string OfficialType { get; set; } = string.Empty;
/// <summary>Start year parsed from the model-date element, page title or main heading; <c>null</c> when none is found.</summary>
public int? YearFrom { get; set; }
/// <summary>End year parsed from the same texts as <see cref="YearFrom"/>; <c>null</c> when none is found.</summary>
public int? YearTo { get; set; }
/// <summary>Trimmed <c>content</c> attribute of the <c>meta property='url'</c> element, or empty when absent.</summary>
public string CanonicalUrl { get; set; } = string.Empty;
}
/// <summary>
/// One vehicle row parsed from a list page by <c>HsnTsnClient</c>.
/// </summary>
public sealed class HsnTsnVehicle
{
/// <summary>Combined code in the form "<c>Hsn</c>/<c>Tsn</c>".</summary>
public string HsnTsn { get; set; } = string.Empty;
/// <summary>Four-digit part of the code found in the first table cell.</summary>
public string Hsn { get; set; } = string.Empty;
/// <summary>Three-character part of the code found in the first table cell.</summary>
public string Tsn { get; set; } = string.Empty;
/// <summary>Page heading when it is usable as a brand, otherwise the first word of <see cref="VehicleType"/>.</summary>
public string Brand { get; set; } = string.Empty;
/// <summary>Cleaned text of the second table cell.</summary>
public string VehicleType { get; set; } = string.Empty;
/// <summary>Not assigned by the list-page parser. NOTE(review): presumably filled from <see cref="VehicleDetail"/> by the caller - confirm.</summary>
public string Model { get; set; } = string.Empty;
/// <summary>Not assigned by the list-page parser, but read when the match key is built. NOTE(review): presumably filled from <see cref="VehicleDetail"/> by the caller - confirm.</summary>
public string OfficialType { get; set; } = string.Empty;
/// <summary>Not assigned by the list-page parser. NOTE(review): presumably filled from <see cref="VehicleDetail"/> by the caller - confirm.</summary>
public int? YearFrom { get; set; }
/// <summary>Not assigned by the list-page parser. NOTE(review): presumably filled from <see cref="VehicleDetail"/> by the caller - confirm.</summary>
public int? YearTo { get; set; }
/// <summary>PS figure parsed from the third table cell; <c>null</c> when the cell does not match.</summary>
public int? PowerPs { get; set; }
/// <summary>kW figure parsed from the third table cell; <c>null</c> when the cell does not match.</summary>
public int? PowerKw { get; set; }
/// <summary>Displacement parsed from the fourth table cell; <c>null</c> when the cell does not match.</summary>
public int? DisplacementCcm { get; set; }
/// <summary>Cleaned text of the fifth table cell.</summary>
public string FuelType { get; set; } = string.Empty;
/// <summary>Normalized key built from brand, vehicle type, official type, kW and fuel type.</summary>
public string MatchKey { get; set; } = string.Empty;
/// <summary>Search query that produced the row; empty for rows parsed from a brand page.</summary>
public string SourceQuery { get; set; } = string.Empty;
/// <summary>URL of the list page the row was parsed from.</summary>
public string SourceListUrl { get; set; } = string.Empty;
/// <summary>Absolute URL of the link in the second table cell, or empty when the cell has no link.</summary>
public string SourceDetailUrl { get; set; } = string.Empty;
}