scripts/check-more-info-urls.py: add script #12506

Draft · wants to merge 13 commits into main
2 changes: 2 additions & 0 deletions scripts/README.md
@@ -14,6 +14,7 @@ This section contains a summary of the scripts available in this directory. For
- [build-index.sh](build-index.sh) script builds the index of available pages.
- [check-pr.sh](check-pr.sh) script checks the page's syntax and performs various checks on the PR.
- [deploy.sh](deploy.sh) script deploys the ZIP and PDF archives to the static website repository.
- [check-more-info-urls.py](check-more-info-urls.py) is a Python script that checks "More information" links for broken URLs or redirects, using asynchronous requests for speed.
- [send-to-bot.py](send-to-bot.py) is a Python script that sends the build or test output to tldr-bot.
- [set-alias-page.py](set-alias-page.py) is a Python script to generate or update alias pages.
- [set-more-info-link.py](set-more-info-link.py) is a Python script to generate or update more information links across pages.
@@ -31,6 +32,7 @@ The table below shows the compatibility of user-executable scripts with differen
| [render.py](pdf/render.py) | ✅ | ✅ | ✅ |
| [build-pdf.sh](pdf/build-pdf.sh) | ✅ | ✅ | ❌ (WSL ✅)|
| [build.sh](build.sh) | ✅ | ✅ | ❌ (WSL ✅)|
| [check-more-info-urls.py](check-more-info-urls.py) | ✅ | ✅ | ✅ |
| [set-alias-pages.py](set-alias-pages.py) | ✅ | ✅ | ✅ |
| [set-more-info-link.py](set-more-info-link.py) | ✅ | ✅ | ✅ |
| [set-page-title.py](set-page-title.py) | ✅ | ✅ | ✅ |
152 changes: 152 additions & 0 deletions scripts/check-more-info-urls.py
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT

"""
A Python script to check for bad (HTTP status code different from 200) "More information" URLs across all pages.

Bad status codes typically indicate a missing page or a redirection; they are written to bad-urls.csv along with their URLs.

Usage:
    python3 scripts/check-more-info-urls.py
"""

import re
import asyncio
import aiohttp
from aioconsole import aprint
from aiofile import AIOFile, Writer
from aiopath import AsyncPath

MAX_CONCURRENCY = 500

sem = asyncio.Semaphore(MAX_CONCURRENCY)
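# Each page check in parse_and_make_request acquires this semaphore, so at
# most MAX_CONCURRENCY requests are in flight at any one time.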


class CodeColors:
    OK = "\033[92m"  # green
    WARNING = "\033[93m"  # yellow
    ERROR = "\033[91m"  # red
    TOO_MANY_REQUESTS = "\033[35m"  # magenta
    UNKNOWN = "\033[4m"  # underline
    RESET = "\033[0m"  # reset to no formatting


async def find_all_pages(pages_path: AsyncPath) -> list[AsyncPath]:
"""Find all pages (*.md files) of all platforms in the given pages path."""
return [page async for page in pages_path.glob("*/*.md")]


async def parse_and_make_request(
    page_path: AsyncPath,
    writer: Writer,
    output_file: AsyncPath,
    session: aiohttp.ClientSession,
) -> None:
    """Parse the URL of a single page and write it to the output file if it is bad."""
    async with sem:
        async with page_path.open("r") as page:
            try:
                page_content = await page.read()
            except Exception as exc:
                await aprint(
                    f"{CodeColors.ERROR}Error: {exc}, File: {page_path.parts[-3:]}{CodeColors.RESET}"
                )
                return

        url = parse_url(page_content)

        if url is not None:
            await make_request_and_write_if_bad(url, writer, output_file, session)


def parse_url(page_content: str) -> str | None:
    """Parse the URL of '> More information: ' from the page content."""
    match = re.search(r"> More information: <(.+)>", page_content)
    return match.group(1) if match else None


async def aprint_colored_status_code_and_url(code: int, url: str) -> None:
    """Print the properly colored status code along with its URL."""
    match code:
        case 200:
            color = CodeColors.OK
        case 404:
            color = CodeColors.ERROR
        case 301:
            color = CodeColors.WARNING
        case 429 | 504 | -1:
            # Rate limited, gateway timeout, or the request itself failed.
            color = CodeColors.TOO_MANY_REQUESTS
        case _:
            color = CodeColors.UNKNOWN
    await aprint(f"{color}{code}{CodeColors.RESET} {url}")


async def make_request_and_write_if_bad(
    url: str, writer: Writer, output_file: AsyncPath, session: aiohttp.ClientSession
) -> None:
    """Make an HTTP request and write the HTTP status code to the output file if it is bad."""
    await aprint(f"??? {url}")
    code = -1
    try:
        code = await get_url_status_code(url, session)
    except aiohttp.ClientError as exc:
        # Client errors may carry an OS-level strerror or an HTTP message.
        if hasattr(exc, "strerror") and exc.strerror:
            await aprint(f"{CodeColors.ERROR}{exc.strerror}{CodeColors.RESET}")
        elif hasattr(exc, "message"):
            await aprint(f"{CodeColors.ERROR}{exc.message}{CodeColors.RESET}")
        else:
            await aprint(f"{CodeColors.ERROR}{exc}{CodeColors.RESET}")
    await aprint_colored_status_code_and_url(code, url)

    if code != 200:
        await writer(f'{code},"{url}"\n')


async def get_url_status_code(url: str, session: aiohttp.ClientSession) -> int:
"""Make an HTTP request to a URL and return its status code."""
async with session.head(url) as response:
return response.status


async def parse_urls_and_write_if_bad(
    output_file: AsyncPath, pages: list[AsyncPath]
) -> None:
    """Parse all URLs, print their status codes, and write the bad ones to the output file."""
    async with AIOFile(str(output_file), "a") as afp:
        writer = Writer(afp)
        # trust_env honors HTTP(S)_PROXY settings; each request gets a generous
        # 500-second total timeout.
        async with aiohttp.ClientSession(
            trust_env=True, timeout=aiohttp.ClientTimeout(total=500)
        ) as session:
            await asyncio.gather(
                *(
                    parse_and_make_request(page_path, writer, output_file, session)
                    for page_path in pages
                )
            )
        await afp.fsync()


async def parse_and_write_bad_urls(
    output_file: AsyncPath, pages_path: str = "./pages"
) -> None:
    """Parse all "More information" URLs, print all, and write the ones with bad status codes (!= 200) to a CSV file."""
    pages_path = AsyncPath(pages_path)
    await aprint("Getting the pages of all platforms...")
    pages = await find_all_pages(pages_path)
    await aprint("Found all pages!")

    await parse_urls_and_write_if_bad(output_file, pages)


async def main() -> None:
    await parse_and_write_bad_urls(AsyncPath("bad-urls.csv"))


if __name__ == "__main__":
    asyncio.run(main())
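
A minimal sketch of what the extraction step does on a typical page. The sample page content is illustrative, and `parse_url` is copied locally because the hyphenated script name cannot be imported as a module without extra tooling:

```python
import re


# Local copy of the script's parse_url, for illustration only.
def parse_url(page_content: str) -> str | None:
    match = re.search(r"> More information: <(.+)>", page_content)
    return match.group(1) if match else None


sample_page = (
    "# curl\n"
    "\n"
    "> Transfers data from or to a server.\n"
    "> More information: <https://curl.se/docs/manpage.html>.\n"
)

# The first "More information" URL is extracted; pages without one yield None.
assert parse_url(sample_page) == "https://curl.se/docs/manpage.html"
assert parse_url("# page-without-a-link\n") is None
```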