Add HsnTsnClient and related classes for vehicle data scraping
- Implement HsnTsnClient to fetch vehicle data from hsn-tsn.de - Create VehicleDetail and HsnTsnVehicle classes for data representation - Add CSV output functionality in Program.cs - Include necessary NuGet packages in project file - Add README with usage instructions
This commit is contained in:
@@ -0,0 +1,3 @@
|
|||||||
|
bin/
|
||||||
|
obj/
|
||||||
|
.idea/
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
# hsntsn-scraper
|
||||||
|
|
||||||
|
.NET console scraper.
|
||||||
|
|
||||||
|
Kaynak: `http://www.hsn-tsn.de/`
|
||||||
|
|
||||||
|
CSV çıktı alanları:
|
||||||
|
|
||||||
|
- `HsnTsn`, `Hsn`, `Tsn`
|
||||||
|
- `Brand`, `VehicleType`, `Model`, `OfficialType`
|
||||||
|
- `YearFrom`, `YearTo`
|
||||||
|
- `PowerPs`, `PowerKw`, `DisplacementCcm`, `FuelType`
|
||||||
|
- `MatchKey`
|
||||||
|
- `SourceQuery`, `SourceListUrl`, `SourceDetailUrl`
|
||||||
|
|
||||||
|
## Çalıştırma
|
||||||
|
|
||||||
|
Tüm marka sayfalarını tara:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
Sadece verilen sorguları tara (`stdin`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
printf "0588\nGolf\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj > hsntsn.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
Detay sayfası zenginleştirmesini kapat:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
printf "0588\n" | dotnet run --project src/HsnTsnScraper/HsnTsnScraper.csproj -- --skip-details
|
||||||
|
```
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
|
||||||
|
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||||
|
# Visual Studio Version 17
|
||||||
|
VisualStudioVersion = 17.0.31903.59
|
||||||
|
MinimumVisualStudioVersion = 10.0.40219.1
|
||||||
|
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{827E0CD3-B72D-47B6-A68D-7590B98EB39B}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HsnTsnScraper", "src\HsnTsnScraper\HsnTsnScraper.csproj", "{F5352DDD-D3DE-4550-A50B-B28C00AA6785}"
|
||||||
|
EndProject
|
||||||
|
Global
|
||||||
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
|
Debug|Any CPU = Debug|Any CPU
|
||||||
|
Debug|x64 = Debug|x64
|
||||||
|
Debug|x86 = Debug|x86
|
||||||
|
Release|Any CPU = Release|Any CPU
|
||||||
|
Release|x64 = Release|x64
|
||||||
|
Release|x86 = Release|x86
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x86.ActiveCfg = Debug|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Debug|x86.Build.0 = Debug|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x86.ActiveCfg = Release|Any CPU
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785}.Release|x86.Build.0 = Release|Any CPU
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
|
HideSolutionNode = FALSE
|
||||||
|
EndGlobalSection
|
||||||
|
GlobalSection(NestedProjects) = preSolution
|
||||||
|
{F5352DDD-D3DE-4550-A50B-B28C00AA6785} = {827E0CD3-B72D-47B6-A68D-7590B98EB39B}
|
||||||
|
EndGlobalSection
|
||||||
|
EndGlobal
|
||||||
@@ -0,0 +1,361 @@
|
|||||||
|
using System.Net;
|
||||||
|
using System.Text.RegularExpressions;
|
||||||
|
using HtmlAgilityPack;
|
||||||
|
using Microsoft.Extensions.Http;
|
||||||
|
using Polly;
|
||||||
|
using Polly.Extensions.Http;
|
||||||
|
|
||||||
|
namespace HsnTsnScraper;
|
||||||
|
|
||||||
|
/// <summary>
/// Fetches and parses vehicle list pages and vehicle detail pages from an hsn-tsn.de style site.
/// Requests go through a single <see cref="HttpClient"/> wrapped in a Polly retry policy.
/// </summary>
public sealed class HsnTsnClient : IDisposable
{
    // "0588 / ABC": four digits, a slash, three upper-case alphanumerics.
    private static readonly Regex HsnTsnRegex = new(@"(?<hsn>\d{4})\s*/\s*(?<tsn>[A-Z0-9]{3})", RegexOptions.Compiled);

    // "75 PS (55 kW)".
    private static readonly Regex PsKwRegex = new(@"(?<ps>\d+)\s*PS\s*\((?<kw>\d+)\s*kW\)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // "1.598 ccm"; separators inside the number are stripped before parsing.
    private static readonly Regex CcmRegex = new(@"(?<ccm>[\d\.\,]+)\s*ccm", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // A four-digit year, optionally followed by a second four-digit year.
    private static readonly Regex YearRangeRegex = new(@"(?<from>\d{4})(?:\D+(?<to>\d{4}))?", RegexOptions.Compiled);

    // A heading that is exactly four digits is not treated as a brand name.
    private static readonly Regex FourDigitHeadingRegex = new(@"^\d{4}$", RegexOptions.Compiled);

    // Everything outside A-Z / 0-9 is collapsed to a single space in match keys.
    private static readonly Regex NonKeyCharsRegex = new(@"[^A-Z0-9]+", RegexOptions.Compiled);

    // Page names (file name without extension) that are never brand pages.
    // The previous set held full paths such as "/impressum.php", which could never match
    // because only links ending in ".html" reach the exclusion check.
    private static readonly HashSet<string> ExcludedPageNames = new(StringComparer.OrdinalIgnoreCase)
    {
        "impressum",
        "datenschutz"
    };

    private readonly HttpClient _client;
    private readonly Uri _baseAddress;

    /// <summary>
    /// Creates a client for the given site root.
    /// </summary>
    /// <param name="baseUrl">Absolute base URL of the site; relative links are resolved against it.</param>
    /// <exception cref="UriFormatException"><paramref name="baseUrl"/> is not a valid absolute URI.</exception>
    public HsnTsnClient(string baseUrl = "http://www.hsn-tsn.de/")
    {
        _baseAddress = new Uri(baseUrl);

        // Retry transient HTTP failures and 429 responses up to three times,
        // waiting 2^attempt seconds before each retry.
        var retryPolicy = HttpPolicyExtensions
            .HandleTransientHttpError()
            .OrResult(response => response.StatusCode == HttpStatusCode.TooManyRequests)
            .WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)));

        var socketHandler = new SocketsHttpHandler
        {
            PooledConnectionLifetime = TimeSpan.FromMinutes(5),
            UseCookies = false
        };

        var pollyHandler = new PolicyHttpMessageHandler(retryPolicy)
        {
            InnerHandler = socketHandler
        };

        _client = new HttpClient(pollyHandler)
        {
            BaseAddress = _baseAddress
        };

        _client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (compatible; hsntsn-scraper/1.0)");
    }

    /// <summary>
    /// Downloads the home page and returns the absolute URLs of all linked ".html" pages
    /// on the same site, sorted case-insensitively and without duplicates.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>Absolute page URLs; empty when the home page contains no matching links.</returns>
    public async Task<IReadOnlyList<string>> GetBrandPageUrls(CancellationToken cancellationToken = default)
    {
        var homeHtml = await GetStringAsync("/", cancellationToken);
        var anchors = LoadDocument(homeHtml).DocumentNode.SelectNodes("//a[@href]");
        if (anchors is null)
        {
            return Array.Empty<string>();
        }

        var pages = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        foreach (var anchor in anchors)
        {
            var href = HtmlEntity.DeEntitize(anchor.GetAttributeValue("href", string.Empty)).Trim();
            if (href.Length == 0 || !href.EndsWith(".html", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Resolve first so that relative and absolute links are filtered by the same rules.
            if (!Uri.TryCreate(_baseAddress, href, out var pageUri) || !IsSameSite(pageUri))
            {
                // Malformed link, non-HTTP scheme, or a link that points to another site.
                continue;
            }

            if (ExcludedPageNames.Contains(Path.GetFileNameWithoutExtension(pageUri.AbsolutePath)))
            {
                continue;
            }

            pages.Add(pageUri.ToString());
        }

        return pages.OrderBy(url => url, StringComparer.OrdinalIgnoreCase).ToArray();
    }

    /// <summary>
    /// Runs a site search ("/liste.php?string=...") and parses the resulting list page.
    /// </summary>
    /// <param name="query">Search text; it is URL-escaped before being sent.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>The vehicles found on the result page, each tagged with <paramref name="query"/>.</returns>
    public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesFromSearchAsync(string query, CancellationToken cancellationToken = default)
    {
        var relativeUrl = $"/liste.php?string={Uri.EscapeDataString(query)}";
        var html = await GetStringAsync(relativeUrl, cancellationToken);
        return ParseVehiclesFromListPage(html, new Uri(_baseAddress, relativeUrl).ToString(), query);
    }

    /// <summary>
    /// Downloads a brand page and parses the vehicles listed on it.
    /// </summary>
    /// <param name="absoluteUrl">URL of the brand page, as returned by <see cref="GetBrandPageUrls"/>.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>The vehicles found on the page; their <c>SourceQuery</c> is empty.</returns>
    public async Task<IReadOnlyList<HsnTsnVehicle>> GetVehiclesFromBrandPageAsync(string absoluteUrl, CancellationToken cancellationToken = default)
    {
        var html = await GetStringAsync(absoluteUrl, cancellationToken);
        return ParseVehiclesFromListPage(html, absoluteUrl, null);
    }

    /// <summary>
    /// Downloads a vehicle detail page and extracts its structured data.
    /// </summary>
    /// <param name="detailUrl">URL of the detail page.</param>
    /// <param name="cancellationToken">Token used to cancel the HTTP request.</param>
    /// <returns>
    /// The parsed detail, or <c>null</c> when <paramref name="detailUrl"/> is blank
    /// or the page has no <c>div#car</c> element.
    /// </returns>
    public async Task<VehicleDetail?> GetVehicleDetailAsync(string detailUrl, CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(detailUrl))
        {
            return null;
        }

        var html = await GetStringAsync(detailUrl, cancellationToken);
        var root = LoadDocument(html).DocumentNode;

        var carNode = root.SelectSingleNode("//div[@id='car']");
        if (carNode is null)
        {
            return null;
        }

        // Text of the first node matching the XPath, cleaned the same way as list-page cells.
        string TextOf(string xpath) => Clean(root.SelectSingleNode(xpath)?.InnerText ?? string.Empty);

        var (yearFrom, yearTo) = ParseYearRange(TextOf("//small[@property='vehicleModelDate']"));
        var canonical = root.SelectSingleNode("//meta[@property='url']")?.GetAttributeValue("content", string.Empty) ?? string.Empty;

        return new VehicleDetail
        {
            Hsn = carNode.GetAttributeValue("data-hsn", string.Empty).Trim(),
            Tsn = carNode.GetAttributeValue("data-tsn", string.Empty).Trim(),
            Brand = TextOf("//span[@property='brand']//span[@property='name']"),
            Model = TextOf("//span[@property='model']"),
            OfficialType = TextOf("//span[@property='alternateName']"),
            // Attribute values are decoded explicitly so "&amp;" in a URL becomes "&".
            CanonicalUrl = HtmlEntity.DeEntitize(canonical).Trim(),
            YearFrom = yearFrom,
            YearTo = yearTo
        };
    }

    /// <summary>Disposes the underlying <see cref="HttpClient"/>.</summary>
    public void Dispose()
    {
        _client.Dispose();
    }

    // GETs the URL (relative URLs resolve against the base address) and returns the body;
    // throws HttpRequestException for non-success status codes.
    private async Task<string> GetStringAsync(string relativeOrAbsoluteUrl, CancellationToken cancellationToken)
    {
        using var response = await _client.GetAsync(relativeOrAbsoluteUrl, cancellationToken);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync(cancellationToken);
    }

    // Parses every table row with at least five cells whose first cell holds an HSN/TSN pair.
    // Cell order: HSN/TSN, vehicle type (with detail link), power, displacement, fuel type.
    private IReadOnlyList<HsnTsnVehicle> ParseVehiclesFromListPage(string html, string sourceListUrl, string? sourceQuery)
    {
        var vehicles = new List<HsnTsnVehicle>();
        var root = LoadDocument(html).DocumentNode;

        var rows = root.SelectNodes("//table//tr");
        if (rows is null)
        {
            return vehicles;
        }

        // The page heading is used as the brand unless it is blank or a bare four-digit number.
        var heading = Clean(root.SelectSingleNode("//h1")?.InnerText ?? string.Empty);
        var pageBrand = IsLikelyBrandHeading(heading) ? heading : string.Empty;

        foreach (var row in rows)
        {
            var cells = row.SelectNodes("./td");
            if (cells is null || cells.Count < 5)
            {
                continue;
            }

            var codeMatch = HsnTsnRegex.Match(Clean(cells[0].InnerText));
            if (!codeMatch.Success)
            {
                continue;
            }

            var hsn = codeMatch.Groups["hsn"].Value;
            var tsn = codeMatch.Groups["tsn"].Value;
            var vehicleType = Clean(cells[1].InnerText);

            // Decode the attribute so an "&amp;" in the link does not end up in the request URL.
            var rawHref = cells[1].SelectSingleNode(".//a[@href]")?.GetAttributeValue("href", string.Empty) ?? string.Empty;
            var detailUrl = ToAbsoluteUrl(HtmlEntity.DeEntitize(rawHref).Trim());

            var (ps, kw) = ParsePower(Clean(cells[2].InnerText));

            var vehicle = new HsnTsnVehicle
            {
                Hsn = hsn,
                Tsn = tsn,
                HsnTsn = $"{hsn}/{tsn}",
                Brand = string.IsNullOrWhiteSpace(pageBrand) ? ExtractBrand(vehicleType) : pageBrand,
                VehicleType = vehicleType,
                PowerPs = ps,
                PowerKw = kw,
                DisplacementCcm = ParseDisplacement(Clean(cells[3].InnerText)),
                FuelType = Clean(cells[4].InnerText),
                SourceListUrl = sourceListUrl,
                SourceDetailUrl = detailUrl,
                SourceQuery = sourceQuery ?? string.Empty
            };

            vehicle.MatchKey = BuildMatchKey(vehicle);
            vehicles.Add(vehicle);
        }

        return vehicles;
    }

    // True for http/https URIs whose host equals the base host, ignoring a leading "www.".
    private bool IsSameSite(Uri uri)
    {
        if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)
        {
            return false;
        }

        return string.Equals(StripWww(uri.Host), StripWww(_baseAddress.Host), StringComparison.OrdinalIgnoreCase);
    }

    private static string StripWww(string host) =>
        host.StartsWith("www.", StringComparison.OrdinalIgnoreCase) ? host[4..] : host;

    // Turns an href into an absolute URL. Absolute http/https hrefs are returned as they are;
    // everything else is treated as a path from the site root. Returns "" for blank or malformed input.
    private string ToAbsoluteUrl(string href)
    {
        if (string.IsNullOrWhiteSpace(href))
        {
            return string.Empty;
        }

        // The scheme check matters: a rooted path such as "/x" can parse as an absolute file URI.
        if (Uri.TryCreate(href, UriKind.Absolute, out var absolute) &&
            (absolute.Scheme.Equals(Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) ||
             absolute.Scheme.Equals(Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase)))
        {
            return absolute.ToString();
        }

        var rooted = href.StartsWith("/", StringComparison.Ordinal) ? href : $"/{href}";
        return Uri.TryCreate(_baseAddress, rooted, out var resolved) ? resolved.ToString() : string.Empty;
    }

    private static HtmlDocument LoadDocument(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }

    // Decodes HTML entities, replaces non-breaking spaces with plain spaces, and trims.
    private static string Clean(string value)
    {
        return HtmlEntity.DeEntitize(value).Replace('\u00A0', ' ').Trim();
    }

    private static int? ParseIntOrNull(string text) => int.TryParse(text, out var number) ? number : null;

    // Extracts PS and kW from text such as "75 PS (55 kW)"; both are null when the text does not match.
    private static (int? Ps, int? Kw) ParsePower(string powerText)
    {
        var match = PsKwRegex.Match(powerText);
        return match.Success
            ? (ParseIntOrNull(match.Groups["ps"].Value), ParseIntOrNull(match.Groups["kw"].Value))
            : (null, null);
    }

    // Extracts the ccm value; '.' and ',' inside the number are removed before parsing.
    private static int? ParseDisplacement(string text)
    {
        var match = CcmRegex.Match(text);
        if (!match.Success)
        {
            return null;
        }

        var digits = match.Groups["ccm"].Value.Replace(".", string.Empty).Replace(",", string.Empty);
        return ParseIntOrNull(digits);
    }

    // Extracts the first four-digit year and, if present, a second one following it.
    private static (int? From, int? To) ParseYearRange(string yearText)
    {
        var match = YearRangeRegex.Match(yearText);
        return match.Success
            ? (ParseIntOrNull(match.Groups["from"].Value), ParseIntOrNull(match.Groups["to"].Value))
            : (null, null);
    }

    // Fallback brand: the first space-separated word of the vehicle type.
    private static string ExtractBrand(string vehicleType)
    {
        if (string.IsNullOrWhiteSpace(vehicleType))
        {
            return string.Empty;
        }

        var words = vehicleType.Split(' ', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        return words.Length == 0 ? string.Empty : words[0];
    }

    // A heading counts as a brand unless it is blank or consists of exactly four digits.
    // NOTE(review): headings of search result pages are also accepted here - confirm against the site.
    private static bool IsLikelyBrandHeading(string heading)
    {
        return !string.IsNullOrWhiteSpace(heading) && !FourDigitHeadingRegex.IsMatch(heading);
    }

    // Builds a normalised upper-case key from the descriptive fields. The field list matches
    // the key rebuilt in Program.cs after detail enrichment, so both produce the same format.
    private static string BuildMatchKey(HsnTsnVehicle vehicle)
    {
        var raw = $"{vehicle.Brand} {vehicle.VehicleType} {vehicle.Model} {vehicle.OfficialType} {vehicle.PowerKw} {vehicle.DisplacementCcm} {vehicle.FuelType}";
        var transliterated = raw.ToUpperInvariant()
            .Replace("Ä", "AE")
            .Replace("Ö", "OE")
            .Replace("Ü", "UE")
            .Replace("ß", "SS");

        // Each run of other characters (whitespace included) becomes one space,
        // so no separate whitespace-collapsing pass is needed.
        return NonKeyCharsRegex.Replace(transliterated, " ").Trim();
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Data extracted from a single vehicle detail page.
/// String properties default to an empty string; the years default to <c>null</c>.
/// </summary>
public sealed class VehicleDetail
{
    /// <summary>Value of the page's <c>data-hsn</c> attribute.</summary>
    public string Hsn { get; set; } = "";

    /// <summary>Value of the page's <c>data-tsn</c> attribute.</summary>
    public string Tsn { get; set; } = "";

    /// <summary>Brand name as shown on the detail page.</summary>
    public string Brand { get; set; } = "";

    /// <summary>Model name as shown on the detail page.</summary>
    public string Model { get; set; } = "";

    /// <summary>Alternate (official) type name as shown on the detail page.</summary>
    public string OfficialType { get; set; } = "";

    /// <summary>First year of the model date range, when one could be parsed.</summary>
    public int? YearFrom { get; set; }

    /// <summary>Last year of the model date range, when one could be parsed.</summary>
    public int? YearTo { get; set; }

    /// <summary>URL the detail page declares for itself, if any.</summary>
    public string CanonicalUrl { get; set; } = "";
}
|
||||||
|
|
||||||
|
/// <summary>
/// One vehicle row of the scraper output.
/// String properties default to an empty string; numeric properties default to <c>null</c>.
/// </summary>
/// <remarks>
/// Keep the declaration order of the properties: the CSV header is generated from this type,
/// and the column order presumably follows the declaration order (confirm against CsvHelper).
/// </remarks>
public sealed class HsnTsnVehicle
{
    /// <summary>Combined key in the form "HSN/TSN".</summary>
    public string HsnTsn { get; set; } = "";

    /// <summary>Four-digit manufacturer code.</summary>
    public string Hsn { get; set; } = "";

    /// <summary>Three-character type code.</summary>
    public string Tsn { get; set; } = "";

    /// <summary>Brand name.</summary>
    public string Brand { get; set; } = "";

    /// <summary>Vehicle type text taken from the list page.</summary>
    public string VehicleType { get; set; } = "";

    /// <summary>Model name taken from the detail page.</summary>
    public string Model { get; set; } = "";

    /// <summary>Official type name taken from the detail page.</summary>
    public string OfficialType { get; set; } = "";

    /// <summary>First year of the model date range.</summary>
    public int? YearFrom { get; set; }

    /// <summary>Last year of the model date range.</summary>
    public int? YearTo { get; set; }

    /// <summary>Power in PS.</summary>
    public int? PowerPs { get; set; }

    /// <summary>Power in kW.</summary>
    public int? PowerKw { get; set; }

    /// <summary>Engine displacement in ccm.</summary>
    public int? DisplacementCcm { get; set; }

    /// <summary>Fuel type text taken from the list page.</summary>
    public string FuelType { get; set; } = "";

    /// <summary>Normalised upper-case key built from the descriptive fields.</summary>
    public string MatchKey { get; set; } = "";

    /// <summary>Search query that produced this row; empty for rows from brand pages.</summary>
    public string SourceQuery { get; set; } = "";

    /// <summary>URL of the list page the row was read from.</summary>
    public string SourceListUrl { get; set; } = "";

    /// <summary>URL of the vehicle's detail page.</summary>
    public string SourceDetailUrl { get; set; } = "";
}
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<OutputType>Exe</OutputType>
|
||||||
|
<TargetFramework>net9.0</TargetFramework>
|
||||||
|
<ImplicitUsings>enable</ImplicitUsings>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<PackageReference Include="CsvHelper" Version="33.0.1" />
|
||||||
|
<PackageReference Include="HtmlAgilityPack" Version="1.11.71" />
|
||||||
|
<PackageReference Include="Microsoft.Extensions.Http" Version="8.0.1" />
|
||||||
|
<PackageReference Include="Microsoft.Extensions.Http.Polly" Version="8.0.10" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
</Project>
|
||||||
@@ -0,0 +1,163 @@
|
|||||||
|
using System.Globalization;
|
||||||
|
using CsvHelper;
|
||||||
|
using CsvHelper.Configuration;
|
||||||
|
using HsnTsnScraper;
|
||||||
|
|
||||||
|
// Entry point: collect vehicles from list pages, optionally enrich them from their
// detail pages, then write one CSV (";"-delimited) to standard output.

// "--skip-details" turns off the per-vehicle detail page requests.
var includeDetails = !args.Contains("--skip-details", StringComparer.OrdinalIgnoreCase);

using var client = new HsnTsnClient();

// Vehicles keyed by "HSN/TSN" so rows found on several pages are merged.
var map = new Dictionary<string, HsnTsnVehicle>(StringComparer.OrdinalIgnoreCase);

if (Console.IsInputRedirected)
{
    // Piped input: every non-empty, distinct line is one search query.
    await foreach (var query in ReadInput())
    {
        var vehicles = await client.GetVehiclesFromSearchAsync(query);
        Merge(map, vehicles);
    }
}
else
{
    // No piped input: crawl every brand page linked from the home page.
    var brandUrls = await client.GetBrandPageUrls();
    foreach (var url in brandUrls)
    {
        var vehicles = await client.GetVehiclesFromBrandPageAsync(url);
        Merge(map, vehicles);
    }
}

if (includeDetails)
{
    foreach (var vehicle in map.Values)
    {
        VehicleDetail? detail;
        try
        {
            detail = await client.GetVehicleDetailAsync(vehicle.SourceDetailUrl);
        }
        catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException)
        {
            // Enrichment is optional: one broken or slow detail page must not discard
            // the list data already collected. Report on stderr so the CSV stays clean.
            await Console.Error.WriteLineAsync(
                $"Skipping details for {vehicle.HsnTsn} ({vehicle.SourceDetailUrl}): {ex.Message}");
            continue;
        }

        if (detail is null)
        {
            continue;
        }

        // Keep the list-page brand when the detail page does not provide one.
        if (!string.IsNullOrWhiteSpace(detail.Brand))
        {
            vehicle.Brand = detail.Brand;
        }

        vehicle.Model = detail.Model;
        vehicle.OfficialType = detail.OfficialType;
        vehicle.YearFrom = detail.YearFrom;
        vehicle.YearTo = detail.YearTo;

        if (!string.IsNullOrWhiteSpace(detail.CanonicalUrl))
        {
            vehicle.SourceDetailUrl = detail.CanonicalUrl;
        }
    }
}

// Rebuild the match keys now that the detail fields may have been filled in.
foreach (var vehicle in map.Values)
{
    vehicle.MatchKey = BuildMatchKey(vehicle);
}

await using var csvWriter = new CsvWriter(Console.Out, new CsvConfiguration(CultureInfo.InvariantCulture)
{
    Delimiter = ";"
});

csvWriter.WriteHeader<HsnTsnVehicle>();
await csvWriter.NextRecordAsync();

foreach (var vehicle in map.Values.OrderBy(x => x.Hsn, StringComparer.OrdinalIgnoreCase).ThenBy(x => x.Tsn, StringComparer.OrdinalIgnoreCase))
{
    csvWriter.WriteRecord(vehicle);
    await csvWriter.NextRecordAsync();
}

await csvWriter.FlushAsync();

return;
|
||||||
|
|
||||||
|
// Adds each vehicle to the map under its "HSN/TSN" key. When the key already exists,
// only the blank/null fields of the stored entry are filled from the new one;
// values that are already present are never overwritten.
void Merge(IDictionary<string, HsnTsnVehicle> mapByHsnTsn, IEnumerable<HsnTsnVehicle> vehicles)
{
    // Returns the candidate only when the current text is blank and the candidate is not.
    static string Prefer(string current, string candidate) =>
        string.IsNullOrWhiteSpace(current) && !string.IsNullOrWhiteSpace(candidate) ? candidate : current;

    foreach (var incoming in vehicles)
    {
        if (!mapByHsnTsn.TryGetValue(incoming.HsnTsn, out var known))
        {
            mapByHsnTsn[incoming.HsnTsn] = incoming;
            continue;
        }

        known.VehicleType = Prefer(known.VehicleType, incoming.VehicleType);
        known.PowerPs ??= incoming.PowerPs;
        known.PowerKw ??= incoming.PowerKw;
        known.DisplacementCcm ??= incoming.DisplacementCcm;
        known.FuelType = Prefer(known.FuelType, incoming.FuelType);
        known.SourceQuery = Prefer(known.SourceQuery, incoming.SourceQuery);
        known.SourceListUrl = Prefer(known.SourceListUrl, incoming.SourceListUrl);
        known.SourceDetailUrl = Prefer(known.SourceDetailUrl, incoming.SourceDetailUrl);
    }
}
|
||||||
|
|
||||||
|
// Streams the lines of standard input as search queries: each line is trimmed,
// empty lines are dropped, and repeated lines (ignoring case) are yielded only once.
async IAsyncEnumerable<string> ReadInput()
{
    var emitted = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    string? rawLine;
    while ((rawLine = await Console.In.ReadLineAsync()) is not null)
    {
        var candidate = rawLine.Trim();

        // Add returns false for a value that was already seen.
        if (candidate.Length > 0 && emitted.Add(candidate))
        {
            yield return candidate;
        }
    }
}
|
||||||
|
|
||||||
|
// Builds the normalised match key of a vehicle: the descriptive fields are joined,
// upper-cased, German umlauts and sharp s are transliterated, and every run of
// characters outside A-Z / 0-9 becomes a single space.
string BuildMatchKey(HsnTsnVehicle vehicle)
{
    var key = $"{vehicle.Brand} {vehicle.VehicleType} {vehicle.Model} {vehicle.OfficialType} {vehicle.PowerKw} {vehicle.DisplacementCcm} {vehicle.FuelType}"
        .ToUpperInvariant();

    foreach (var (special, ascii) in new[] { ("Ä", "AE"), ("Ö", "OE"), ("Ü", "UE"), ("ß", "SS") })
    {
        key = key.Replace(special, ascii);
    }

    key = System.Text.RegularExpressions.Regex.Replace(key, @"[^A-Z0-9]+", " ").Trim();
    return System.Text.RegularExpressions.Regex.Replace(key, @"\s+", " ");
}
|
||||||
Reference in New Issue
Block a user