From 1cc312471ae0b97324613e099c6bd83b2de03ba5 Mon Sep 17 00:00:00 2001
From: Dmitry Chepurovskiy
Date: Tue, 7 Jun 2022 00:55:06 +0300
Subject: [PATCH] WIP: search

---
 .../Indexers/Definitions/Lostfilm.cs | 220 ++++++++++++++++--
 1 file changed, 209 insertions(+), 11 deletions(-)

diff --git a/src/NzbDrone.Core/Indexers/Definitions/Lostfilm.cs b/src/NzbDrone.Core/Indexers/Definitions/Lostfilm.cs
index b0319a93cbc..fd4c2111270 100644
--- a/src/NzbDrone.Core/Indexers/Definitions/Lostfilm.cs
+++ b/src/NzbDrone.Core/Indexers/Definitions/Lostfilm.cs
@@ -2,12 +2,14 @@
 using System.Collections.Generic;
 using System.Collections.Specialized;
 using System.Globalization;
+using System.Linq;
 using System.Net.Http;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Threading.Tasks;
 using AngleSharp.Html.Parser;
 using FluentValidation;
+using Newtonsoft.Json.Linq;
 using NLog;
 using NzbDrone.Common.Http;
 using NzbDrone.Core.Annotations;
@@ -51,7 +53,7 @@ public Lostfilm(IIndexerHttpClient httpClient, IEventAggregator eventAggregator,
 
         public override IIndexerRequestGenerator GetRequestGenerator()
         {
-            return new LostfilmRequestGenerator() { Settings = Settings, Capabilities = Capabilities };
+            return new LostfilmRequestGenerator() { Settings = Settings, Capabilities = Capabilities, HttpClient = _httpClient, Logger = _logger, Definition = Definition, Indexer = this };
         }
 
         public override IParseIndexerResponse GetParser()
@@ -166,33 +168,207 @@ public class LostfilmRequestGenerator : IIndexerRequestGenerator
     {
         public UserPassCaptchaTorrentBaseSettings Settings { get; set; }
         public IndexerCapabilities Capabilities { get; set; }
+        public IIndexerHttpClient HttpClient { get; set; }
+        public ProviderDefinition Definition { get; set; }
+        public Lostfilm Indexer { get; set; }
+        public Logger Logger { get; set; }
 
         public LostfilmRequestGenerator()
         {
         }
 
-        private IEnumerable<IndexerRequest> GetPagedRequests(string term, int[] categories)
+        /// <summary>
+        /// Resolves a search term to LostFilm series page URLs by posting to the site's "ajaxik.php" search action.
+        /// </summary>
+        /// <param name="term">Space-separated keywords: a series title, optionally followed by an episode title.</param>
+        /// <param name="season">Season number; <c>null</c> or 0 selects the "/seasons" overview page.</param>
+        /// <param name="episode">Episode number appended as "/episode_N"; skipped when null or empty.</param>
+        /// <returns>One URL per series matched by the longest keyword prefix that returned results; empty if none.</returns>
+        private IList<string> GetSearchPageURLs(string term, int? season, string episode)
+        {
+            var urls = new List<string>();
+
+            // A null or blank term has no keywords to search for.
+            if (string.IsNullOrWhiteSpace(term))
+            {
+                return urls;
+            }
+
+            /*
+            Torznab query for some series could contain sanitized title. E.g. "Star Wars: The Clone Wars" will become "Star Wars The Clone Wars".
+            Search API on LostFilm.tv doesn't return anything on such search query so the query should be "morphed" even for "tvsearch" queries.
+            Also the queries for Specials are a union of Series and Episode titles. E.g.: "Breaking Bad - El Camino: A Breaking Bad Movie".
+            The algorithm works in the following way:
+                1. Search with the full SearchTerm. Just for example, let's search for episode by its name
+                    - {Star Wars The Clone Wars To Catch a Jedi}
+                2. [loop] If none were found, repeat search with SearchTerm reduced by 1 word from the end. Fail search if no words left and no results were obtained
+                    - {Star Wars The Clone Wars To Catch a} Jedi
+                    - {Star Wars The Clone Wars To Catch} a Jedi
+                    - ...
+                    - {Star Wars} The Clone Wars To Catch a Jedi
+                3. When we got few results, try to filter them with the words excluded before
+                    - [Star Wars: The Clone Wars, Star Wars Rebels, Star Wars: Forces of Destiny]
+                        .filterBy(The Clone Wars To Catch a Jedi)
+                4. [loop] Reduce filterTerm by 1 word from the end. Fail search if no words left and no results were obtained
+                        .filterBy(The Clone Wars To Catch a) / Jedi
+                        .filterBy(The Clone Wars To Catch) / a Jedi
+                        ...
+                        .filterBy(The Clone Wars) / To Catch a Jedi
+                5. [loop] Now we know that series we're looking for is called "Star Wars The Clone Wars". Fetch series detail page for it and try to apply remaining words as episode filter, reducing filter by 1 word each time we get no results:
+                    - .episodes().filteredBy(To Catch a Jedi)
+                    - .episodes().filteredBy(To Catch a) / Jedi
+                    - ...
+                    - .episodes() / To Catch a Jedi
+            Test queries:
+                - "Star Wars The Clone Wars To Catch a Jedi" -> S05E19
+                - "Breaking Bad El Camino A Breaking Bad Movie" -> Special
+                - "The Magicians (2015)" -> Year should be ignored
+            NOTE: only steps 1-4 are implemented in this method; the episode filter (step 5) and year stripping are not.
+            */
+
+            // Search query words. Consists of Series keywords that will be used for series search request, and Episode keywords that will be used for episode filtering.
+            // Empty entries are dropped so that repeated spaces do not produce blank keywords.
+            var keywords = term.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList();
+
+            // Keywords count related to Series Search.
+            var searchKeywords = keywords.Count;
+
+            // Keywords count related to Series Filter.
+            var serieFilterKeywords = 0;
+
+            // The season part of the URL does not depend on the search results, so it is computed once.
+            var seasonPath = season.GetValueOrDefault() == 0
+                ? "/seasons"
+                : "/season_" + season.Value.ToString(CultureInfo.InvariantCulture);
+
+            // Overall (keywords.count - searchKeywords - serieFilterKeywords) are related to episode filter
+            do
+            {
+                var searchString = string.Join(" ", keywords.Take(searchKeywords));
+                var data = new Dictionary<string, string>
+                {
+                    { "act", "common" },
+                    { "type", "search" },
+                    { "val", searchString }
+                };
+
+                var requestBuilder = new HttpRequestBuilder(Settings.BaseUrl + "ajaxik.php");
+                foreach (var item in data)
+                {
+                    requestBuilder.AddFormParameter(item.Key, item.Value);
+                }
+
+                requestBuilder.PostProcess += r => r.RequestTimeout = TimeSpan.FromSeconds(15);
+                requestBuilder.SetCookies(Indexer.Cookies);
+                var req = new IndexerRequest(requestBuilder.Build());
+                var response = new IndexerResponse(req, HttpClient.ExecuteProxied(req.HttpRequest, Definition));
+
+                // An empty body cannot be parsed as JSON; treat it as "nothing found" for this prefix.
+                if (string.IsNullOrWhiteSpace(response.Content))
+                {
+                    continue; // Search loop
+                }
+
+                // Only a JSON object can be indexed by property name; any other token type throws on json["data"].
+                var json = JToken.Parse(response.Content);
+                if (json.Type != JTokenType.Object)
+                {
+                    continue; // Search loop
+                }
+
+                // Protect from {"data":false,"result":"ok"} and from a missing "data" property
+                var jsonData = json["data"];
+                if (jsonData == null || jsonData.Type != JTokenType.Object)
+                {
+                    continue; // Search loop
+                }
+
+                var jsonSeries = jsonData["series"];
+                if (jsonSeries == null || !jsonSeries.HasValues)
+                {
+                    continue; // Search loop
+                }
+
+                var series = jsonSeries.ToList();
+
+                // Filter found series
+                if (series.Count > 1)
+                {
+                    serieFilterKeywords = keywords.Count - searchKeywords;
+
+                    do
+                    {
+                        var serieFilter = string.Join(" ", keywords.GetRange(searchKeywords, serieFilterKeywords));
+                        var filteredSeries = series.Where(s => s["title_orig"]?.Value<string>()?.Contains(serieFilter) == true).ToList();
+
+                        if (filteredSeries.Count > 0)
+                        {
+                            series = filteredSeries;
+                            break; // Serie Filter loop
+                        }
+                    }
+                    while (--serieFilterKeywords > 0);
+                }
+
+                foreach (var serie in series)
+                {
+                    var link = serie["link"]?.ToString();
+                    if (string.IsNullOrEmpty(link))
+                    {
+                        continue; // Series loop: an entry without a link cannot be turned into a URL
+                    }
+
+                    var url = Settings.BaseUrl + link.TrimStart('/') + seasonPath;
+
+                    if (!string.IsNullOrEmpty(episode))
+                    {
+                        // Fetch single episode releases
+                        // TODO: Add a togglable Quick Path via v_search.php in Indexer Settings
+                        url += "/episode_" + episode;
+                    }
+
+                    urls.Add(url);
+                }
+
+                // Stop at the longest keyword prefix that produced results: shorter prefixes
+                // would only repeat the same series or add unrelated ones.
+                if (urls.Count > 0)
+                {
+                    break; // Search loop
+                }
+            }
+            while (--searchKeywords > 0);
+
+            return urls;
+        }
+
+        /// <summary>
+        /// Builds one HTML request per page URL: the "new" page for a blank term, otherwise the series pages found by search.
+        /// </summary>
+        private IEnumerable<IndexerRequest> GetPagedRequests(string term, int[] categories, int? season, string episode)
         {
-            var requestUrl = string.Empty;
+            var requestUrls = new List<string>();
 
             if (string.IsNullOrWhiteSpace(term))
             {
-                requestUrl = Settings.BaseUrl + "new";
+                requestUrls.Add(Settings.BaseUrl + "new");
             }
             else
             {
-                throw new Exception("Lostfilm search not implemented");
+                requestUrls.AddRange(GetSearchPageURLs(term, season, episode));
             }
 
-            var request = new IndexerRequest(requestUrl, HttpAccept.Html);
-            yield return request;
+            foreach (var url in requestUrls)
+            {
+                yield return new IndexerRequest(url, HttpAccept.Html);
+            }
         }
 
         public IndexerPageableRequestChain GetSearchRequests(MovieSearchCriteria searchCriteria)
         {
             var pageableRequests = new IndexerPageableRequestChain();
 
-            pageableRequests.Add(GetPagedRequests(string.Format("{0}", searchCriteria.SanitizedSearchTerm), searchCriteria.Categories));
+            pageableRequests.Add(GetPagedRequests(string.Format("{0}", searchCriteria.SanitizedSearchTerm), searchCriteria.Categories, null, ""));
 
             return pageableRequests;
         }
@@ -201,7 +377,7 @@ public IndexerPageableRequestChain GetSearchRequests(TvSearchCriteria searchCrit
         {
             var pageableRequests = new IndexerPageableRequestChain();
 
-            pageableRequests.Add(GetPagedRequests(string.Format("{0}", searchCriteria.SanitizedTvSearchString), searchCriteria.Categories));
+            pageableRequests.Add(GetPagedRequests(string.Format("{0}", searchCriteria.SanitizedTvSearchString), searchCriteria.Categories, searchCriteria.Season, searchCriteria.Episode));
 
             return pageableRequests;
         }
@@ -210,7 +386,7 @@ public IndexerPageableRequestChain GetSearchRequests(BasicSearchCriteria searchC
         {
             var pageableRequests = new IndexerPageableRequestChain();
 
-            pageableRequests.Add(GetPagedRequests(string.Format("{0}", searchCriteria.SanitizedSearchTerm), searchCriteria.Categories));
+            pageableRequests.Add(GetPagedRequests(string.Format("{0}", searchCriteria.SanitizedSearchTerm), searchCriteria.Categories, null, ""));
 
             return pageableRequests;
         }
@@ -461,7 +637,7 @@ private IList<ReleaseInfo> ParseRelease(IndexerResponse indexerResponse)
             return releases;
         }
 
-        public IList<ReleaseInfo> ParseResponse(IndexerResponse indexerResponse)
+        private IList<ReleaseInfo> ParseNewResponse(IndexerResponse indexerResponse)
         {
             var releases = new List<ReleaseInfo>();
 
@@ -493,6 +669,28 @@ public IList<ReleaseInfo> ParseResponse(IndexerResponse indexerResponse)
             return releases.ToArray();
         }
 
+        /// <summary>
+        /// Parses a series page requested by a search. Not implemented yet: always returns an empty list.
+        /// </summary>
+        private IList<ReleaseInfo> ParseSearchResponse(IndexerResponse indexerResponse)
+        {
+            // TODO: parse the season/episode page; until then search requests yield no releases.
+            return Array.Empty<ReleaseInfo>();
+        }
+
+        /// <summary>
+        /// Routes a response to the "new" page parser or the search parser based on the requested URL path.
+        /// </summary>
+        public IList<ReleaseInfo> ParseResponse(IndexerResponse indexerResponse)
+        {
+            if (indexerResponse.Request.Url.Path == "/new")
+            {
+                return ParseNewResponse(indexerResponse);
+            }
+
+            return ParseSearchResponse(indexerResponse);
+        }
+
         public Action<IDictionary<string, string>, DateTime?> CookiesUpdater { get; set; }
 
         private string TrimString(string s, char startChar, char endChar)