Skip to content

Commit

Permalink
Add support for 'knockout' selectors that get filtered from output
Browse files Browse the repository at this point in the history
  • Loading branch information
jimwins committed Mar 29, 2024
1 parent ff4f61c commit 828914a
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 2 deletions.
9 changes: 8 additions & 1 deletion frozen_soup/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Union
from typing import Optional, Union, List

import requests

Expand All @@ -15,6 +15,7 @@ def freeze_to_string(
session: Optional[requests.Session] = None,
timeout: Union[float, tuple[float, float], None] = 900.0,
formatter: str = 'html5',
knockouts: Optional[List[str]] = None,
) -> str:
if session is None:
session = requests.Session()
Expand All @@ -23,6 +24,12 @@ def freeze_to_string(

soup = BeautifulSoup(r.text, 'html.parser')

# Process the knockouts first so we don't do any extra work on those
if knockouts is not None:
for selector in knockouts:
for tag in soup.css.select(selector):
tag.decompose()

base_url = url

# Find the first <base href="">, which could follow a <base target="">
Expand Down
11 changes: 10 additions & 1 deletion frozen_soup/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,26 +29,35 @@ def main() -> int:
'-T', '--timeout',
type=float,
default=900.0,
metavar= 'SECONDS',
help='default connect and read timeout in seconds'
)
parser.add_argument(
'--connect-timeout',
type=float,
metavar= 'SECONDS',
help='default connect timeout in seconds (will override --timeout)'
)
parser.add_argument(
'--read-timeout',
type=float,
metavar= 'SECONDS',
help='default read timeout in seconds (will override --timeout)'
)
parser.add_argument(
'--knockout',
action= 'append',
metavar= 'SELECTOR',
help='knock out elements matching the given CSS selector'
)

args = parser.parse_args()

timeout = args.timeout
if (args.connect_timeout or args.read_timeout):
timeout = (args.connect_timeout or timeout, args.read_timeout or timeout)

print(freeze_to_string(args.url, timeout=timeout))
print(freeze_to_string(args.url, timeout=timeout, knockouts=args.knockout))

return 0

Expand Down
47 changes: 47 additions & 0 deletions tests/test_knockout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest

import requests
from requests_testadapter import TestAdapter, TestSession

from frozen_soup import freeze_to_string

@pytest.fixture
def session() -> requests.Session:
    """Return a test session serving canned responses under ``http://test/``.

    Mounted prefixes:

    * ``/content``  -- a small ``text/plain`` payload used as an image source.
    * ``/html``     -- one ``.ko`` element followed by an ``<img>``.
    * ``/multiple`` -- two ``.ko`` elements followed by an ``<img>``.
    * ``/bad-img``  -- one ``.ko`` element and an ``<img>`` pointing at
      ``/error``, for which this fixture mounts no adapter.
    """
    test_session = TestSession()

    plain_text_adapter = TestAdapter(
        b'/* WONTON */',
        headers={'Content-type': 'text/plain'},
    )

    # (prefix, adapter) pairs, mounted in the order listed.
    routes = [
        ("http://test/content", plain_text_adapter),
        (
            "http://test/html",
            TestAdapter(b'<i class="ko">pow!</i><img src="/content">'),
        ),
        (
            "http://test/multiple",
            TestAdapter(
                b'<i class="ko">pow!</i><b class="ko">bang!</b><img src="/content">'
            ),
        ),
        (
            "http://test/bad-img",
            TestAdapter(b'<i class="ko">pow!</i><img src="/error">'),
        ),
    ]
    for prefix, adapter in routes:
        test_session.mount(prefix, adapter)

    return test_session

def test_knockout(session):
    """A single class selector removes the matching element from the output."""
    expected = '<img src="data:text/plain;base64,LyogV09OVE9OICov">'
    frozen = freeze_to_string('http://test/html', session, knockouts=['.ko'])
    assert frozen == expected

def test_knockout_multiple_elements(session):
    """One selector that matches several elements removes all of them."""
    expected = '<img src="data:text/plain;base64,LyogV09OVE9OICov">'
    frozen = freeze_to_string(
        'http://test/multiple',
        session,
        knockouts=['.ko'],
    )
    assert frozen == expected

def test_knockout_multiple_selectors(session):
    """Each selector in the knockout list is applied to the document."""
    selectors = ['i', 'b']
    expected = '<img src="data:text/plain;base64,LyogV09OVE9OICov">'
    frozen = freeze_to_string(
        'http://test/multiple',
        session,
        knockouts=selectors,
    )
    assert frozen == expected

def test_knockout_img(session):
    """Knocking out the ``<img>`` leaves only the remaining markup.

    If the knockout did not remove the ``<img>``, we would get an exception
    instead of output (its ``/error`` source is not served by the fixture).
    """
    expected = '<i class="ko">pow!</i>'
    frozen = freeze_to_string(
        'http://test/bad-img',
        session,
        knockouts=['img'],
    )
    assert frozen == expected

0 comments on commit 828914a

Please sign in to comment.