scripts/check-more-info-urls.py: add script #12506

Draft · wants to merge 13 commits into main
2 changes: 2 additions & 0 deletions scripts/README.md
@@ -14,6 +14,7 @@ This section contains a summary of the scripts available in this directory. For
- [build-index.sh](build-index.sh) script builds the index of available pages.
- [check-pr.sh](check-pr.sh) script checks the page's syntax and performs various checks on the PR.
- [deploy.sh](deploy.sh) script deploys the ZIP and PDF archives to the static website repository.
- [check-more-info-urls.py](check-more-info-urls.py) is a Python script that checks "More information" links for broken URLs or redirects, using asynchronous requests for speed.
- [send-to-bot.py](send-to-bot.py) is a Python script that sends the build or test output to tldr-bot.
- [set-alias-page.py](set-alias-page.py) is a Python script to generate or update alias pages.
- [set-more-info-link.py](set-more-info-link.py) is a Python script to generate or update more information links across pages.
@@ -31,6 +32,7 @@ The table below shows the compatibility of user-executable scripts with differen
| [render.py](pdf/render.py) | ✅ | ✅ | ✅ |
| [build-pdf.sh](pdf/build-pdf.sh) | ✅ | ✅ | ❌ (WSL ✅)|
| [build.sh](build.sh) | ✅ | ✅ | ❌ (WSL ✅)|
| [check-more-info-urls.py](check-more-info-urls.py) | ✅ | ✅ | ✅ |
| [set-alias-pages.py](set-alias-pages.py) | ✅ | ✅ | ✅ |
| [set-more-info-link.py](set-more-info-link.py) | ✅ | ✅ | ✅ |
| [set-page-title.py](set-page-title.py) | ✅ | ✅ | ✅ |
152 changes: 152 additions & 0 deletions scripts/check-more-info-urls.py
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT

"""
A Python script to check for bad (HTTP status code different from 200) "More information" URLs across all pages.

Bad status codes typically indicate a missing page or a redirection; they are written to bad-urls.csv along with their URLs.

Usage:
    python3 scripts/check-more-info-urls.py
"""

import re
import asyncio
import aiohttp
from aioconsole import aprint
from aiofile import AIOFile, Writer
from aiopath import AsyncPath

MAX_CONCURRENCY = 500

sem = asyncio.Semaphore(MAX_CONCURRENCY)
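# Each page check in parse_and_make_request acquires this semaphore, so at
# most MAX_CONCURRENCY requests are in flight at any one time.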


class CodeColors:
    OK = "\033[92m"  # green
    WARNING = "\033[93m"  # yellow
    ERROR = "\033[91m"  # red
    TOO_MANY_REQUESTS = "\033[35m"  # magenta
    UNKNOWN = "\033[4m"  # underline
    RESET = "\033[0m"  # reset to no formatting


async def find_all_pages(pages_path: AsyncPath) -> list[AsyncPath]:
"""Find all pages (*.md files) of all platforms in the given pages path."""
return [page async for page in pages_path.glob("*/*.md")]


async def parse_and_make_request(
    page_path: AsyncPath,
    writer: Writer,
    output_file: AsyncPath,
    session: aiohttp.ClientSession,
) -> None:
    """Parse the URL of a single page and write it to the output file if it is bad."""
    async with sem:
        async with page_path.open("r") as page:
            try:
                page_content = await page.read()
            except Exception as exc:
                await aprint(
                    f"{CodeColors.ERROR}Error: {exc}, File: {page_path.parts[-3:]}{CodeColors.RESET}"
                )
                return

        url = parse_url(page_content)

        if url is not None:
            await make_request_and_write_if_bad(url, writer, output_file, session)


def parse_url(page_content: str) -> str | None:
    """Parse the URL of '> More information: ' from the page content."""
    match = re.search(r"> More information: <(.+)>", page_content)
    return match.group(1) if match else None


async def aprint_colored_status_code_and_url(code: int, url: str) -> None:
    """Print the properly colored status code along with its URL."""
    match code:
        case 200:
            color = CodeColors.OK
        case 404:
            color = CodeColors.ERROR
        case 301:
            color = CodeColors.WARNING
        case 429 | 504 | -1:
            # Rate limited, gateway timeout, or the request itself failed.
            color = CodeColors.TOO_MANY_REQUESTS
        case _:
            color = CodeColors.UNKNOWN
    await aprint(f"{color}{code}{CodeColors.RESET} {url}")


async def make_request_and_write_if_bad(
    url: str, writer: Writer, output_file: AsyncPath, session: aiohttp.ClientSession
) -> None:
    """Make an HTTP request and write the HTTP status code to the output file if it is bad."""
    await aprint(f"??? {url}")
    code = -1
    try:
        code = await get_url_status_code(url, session)
    except aiohttp.ClientError as exc:
        # Client errors may carry an OS-level strerror or an HTTP message.
        if hasattr(exc, "strerror") and exc.strerror:
            await aprint(f"{CodeColors.ERROR}{exc.strerror}{CodeColors.RESET}")
        elif hasattr(exc, "message"):
            await aprint(f"{CodeColors.ERROR}{exc.message}{CodeColors.RESET}")
        else:
            await aprint(f"{CodeColors.ERROR}{exc}{CodeColors.RESET}")
    await aprint_colored_status_code_and_url(code, url)

    if code != 200:
        await writer(f'{code},"{url}"\n')


async def get_url_status_code(url: str, session: aiohttp.ClientSession) -> int:
"""Make an HTTP request to a URL and return its status code."""
async with session.head(url) as response:
return response.status


async def parse_urls_and_write_if_bad(
    output_file: AsyncPath, pages: list[AsyncPath]
) -> None:
    """Parse all URLs, print their status codes, and write the bad ones to the output file."""
    async with AIOFile(str(output_file), "a") as afp:
        writer = Writer(afp)
        # trust_env honors HTTP(S)_PROXY settings; each request gets a generous
        # 500-second total timeout.
        async with aiohttp.ClientSession(
            trust_env=True, timeout=aiohttp.ClientTimeout(total=500)
        ) as session:
            await asyncio.gather(
                *(
                    parse_and_make_request(page_path, writer, output_file, session)
                    for page_path in pages
                )
            )
        await afp.fsync()


async def parse_and_write_bad_urls(
    output_file: AsyncPath, pages_path: str = "./pages"
) -> None:
    """Parse all "More information" URLs, print all, and write the ones with bad status codes (!= 200) to a CSV file."""
    pages_path = AsyncPath(pages_path)
    await aprint("Getting the pages of all platforms...")
    pages = await find_all_pages(pages_path)
    await aprint("Found all pages!")

    await parse_urls_and_write_if_bad(output_file, pages)


async def main() -> None:
    await parse_and_write_bad_urls(AsyncPath("bad-urls.csv"))


if __name__ == "__main__":
    asyncio.run(main())
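
A minimal sketch of what the extraction step does on a typical page. The sample page content is illustrative, and `parse_url` is copied locally because the hyphenated script name cannot be imported as a module without extra tooling:

```python
import re


# Local copy of the script's parse_url, for illustration only.
def parse_url(page_content: str) -> str | None:
    match = re.search(r"> More information: <(.+)>", page_content)
    return match.group(1) if match else None


sample_page = (
    "# curl\n"
    "\n"
    "> Transfers data from or to a server.\n"
    "> More information: <https://curl.se/docs/manpage.html>.\n"
)

# The first "More information" URL is extracted; pages without one yield None.
assert parse_url(sample_page) == "https://curl.se/docs/manpage.html"
assert parse_url("# page-without-a-link\n") is None
```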