diff --git a/frozen_soup/__init__.py b/frozen_soup/__init__.py index 543e161..f252f7a 100644 --- a/frozen_soup/__init__.py +++ b/frozen_soup/__init__.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Optional, Union, List import requests @@ -15,6 +15,7 @@ def freeze_to_string( session: Optional[requests.Session] = None, timeout: Union[float, tuple[float, float], None] = 900.0, formatter: str = 'html5', + knockouts: Optional[List[str]] = None, ) -> str: if session is None: session = requests.Session() @@ -23,6 +24,12 @@ def freeze_to_string( soup = BeautifulSoup(r.text, 'html.parser') + # Process the knockouts first, removing every element that matches one of the knockout CSS selectors from the parsed tree, so we don't do any extra work on those + if knockouts is not None: + for selector in knockouts: + for tag in soup.css.select(selector): + tag.decompose() + base_url = url # Find the first , which could follow a diff --git a/frozen_soup/__main__.py b/frozen_soup/__main__.py index 6c553a9..2d6b25b 100644 --- a/frozen_soup/__main__.py +++ b/frozen_soup/__main__.py @@ -29,18 +29,27 @@ def main() -> int: '-T', '--timeout', type=float, default=900.0, + metavar= 'SECONDS', help='default connect and read timeout in seconds' ) parser.add_argument( '--connect-timeout', type=float, + metavar= 'SECONDS', help='default connect timeout in seconds (will override --timeout)' ) parser.add_argument( '--read-timeout', type=float, + metavar= 'SECONDS', help='default read timeout in seconds (will override --timeout)' ) + parser.add_argument( + '--knockout', + action= 'append', + metavar= 'SELECTOR', + help='knock out elements matching the given CSS selector' + ) args = parser.parse_args() @@ -48,7 +57,7 @@ def main() -> int: if (args.connect_timeout or args.read_timeout): timeout = (args.connect_timeout or timeout, args.read_timeout or timeout) - print(freeze_to_string(args.url, timeout=timeout)) + print(freeze_to_string(args.url, timeout=timeout, knockouts=args.knockout)) return 0 diff --git a/tests/test_knockout.py b/tests/test_knockout.py new file 
mode 100644 index 0000000..7e94252 --- /dev/null +++ b/tests/test_knockout.py @@ -0,0 +1,47 @@ +import pytest + +import requests +from requests_testadapter import TestAdapter, TestSession + +from frozen_soup import freeze_to_string + +@pytest.fixture +def session() -> requests.Session: + s = TestSession() + + s.mount("http://test/content", TestAdapter( + b'/* WONTON */', + headers= { 'Content-type' : 'text/plain' } + )) + + s.mount( + "http://test/html", + TestAdapter(b'pow!') + ) + s.mount( + "http://test/multiple", + TestAdapter(b'pow!bang!') + ) + s.mount( + "http://test/bad-img", + TestAdapter(b'pow!') + ) + + return s + +def test_knockout(session): + out = freeze_to_string('http://test/html', session, knockouts=['.ko']) + assert out == '' + +def test_knockout_multiple_elements(session): + out = freeze_to_string('http://test/multiple', session, knockouts=['.ko']) + assert out == '' + +def test_knockout_multiple_selectors(session): + out = freeze_to_string('http://test/multiple', session, knockouts=['i', 'b']) + assert out == '' + +# if the knockout doesn't kill the img element we'll get an exception +def test_knockout_img(session): + out = freeze_to_string('http://test/bad-img', session, knockouts=['img']) + assert out == 'pow!'