Skip to content

Commit

Permalink
Merge pull request #1924 from midichef/http_parse
Browse files Browse the repository at this point in the history
[http] fix parsing link header
  • Loading branch information
anjakefala committed Jun 20, 2023
2 parents fb4352c + 3a8a29a commit c1fef18
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 5 deletions.
45 changes: 41 additions & 4 deletions visidata/loaders/http.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from visidata import Path, RepeatFile, vd, VisiData
from visidata.loaders.tsv import splitter

Expand Down Expand Up @@ -47,24 +49,59 @@ def _iter_lines(path=path, response=response, max_next=vd.options.http_max_next)
linkhdr = response.getheader('Link')
src = None
if linkhdr:
links = urllib.parse.parse_header(linkhdr)
src = links.get('next', {}).get('url', None)
links = parse_header_links(linkhdr)
link_data = {}
for link in links:
key = link.get('rel') or link.get('url')
link_data[key] = link
src = link_data.get('next', {}).get('url', None)

if not src:
break

n += 1
if n > max_next:
vd.warning(f'stopping at max {max_next} pages')
vd.warning(f'stopping at max next pages: {max_next} pages')
break

vd.status(f'fetching next page from {src}')
response = requests.get(src, stream=True, **vd.options.getall('http_req_'))
req = urllib.request.Request(src, **vd.options.getall('http_req_'))
response = urllib.request.urlopen(req)

# add resettable iterator over contents as an already-open fp
path.fptext = RepeatFile(_iter_lines())

return vd.openSource(path, filetype=filetype)

def parse_header_links(link_header):
    '''Parse an HTTP Link header into a list of dictionaries, one per link.

    Takes a link header string, of the form
        '<https://example.com/content?page=1>; rel="prev", <https://example.com/content?page=3>; rel="next"'
    and returns
        [{'url': 'https://example.com/content?page=1', 'rel': 'prev'},
         {'url': 'https://example.com/content?page=3', 'rel': 'next'}]

    Each dictionary has a 'url' key plus one key per "name=value" parameter;
    surrounding spaces and quotes are stripped from names and values.
    Parameters without a value (e.g. a bare "crossorigin") are skipped
    without discarding the parameters that follow them.
    Returns [] for an empty or blank header.

    This is a lenient splitter, not a full RFC 8288 parser: a ';' or ', <'
    inside a quoted parameter value is still treated as a separator.
    See https://datatracker.ietf.org/doc/html/rfc8288#section-3
    '''
    quote_space = ' \'"'
    link_header = link_header.strip(quote_space)
    if not link_header:
        return []

    links = []
    # Splitting on ', <' consumes the opening '<' of every link but the first.
    for i, link_value in enumerate(re.split(', *<', link_header)):
        bracketed = i > 0 or link_value.startswith('<')
        target, closed, rest = link_value.partition('>')
        if bracketed and closed:
            # Cut the target at its closing '>' so that a ';' inside the URL
            # is not mistaken for the start of the parameters.
            url = target
            params = rest.partition(';')[2]
        elif ';' in link_value:
            # Unbracketed (malformed) target: fall back to the first ';'.
            url, params = link_value.split(';', maxsplit=1)
        else:
            url, params = link_value, ''
        link = {'url': url.strip('<>' + quote_space)}

        for param in params.split(';'):
            # partition() splits on the first '=' only, so values that
            # themselves contain '=' (e.g. title="x=y") are kept intact.
            key, has_value, value = param.partition('=')
            if not has_value:
                continue
            link[key.strip(quote_space)] = value.strip(quote_space)
        links.append(link)
    return links

# Alias: openurl_https is bound to the same function object as openurl_http.
# NOTE(review): presumably so that https:// URLs are opened by the same loader
# as http:// ones -- confirm against the URL-scheme dispatch code.
VisiData.openurl_https = VisiData.openurl_http
2 changes: 1 addition & 1 deletion visidata/plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import shutil
import importlib
import subprocess
import urllib
import urllib.error

from visidata import VisiData, vd, Path, CellColorizer, JsonLinesSheet, AttrDict, Column, Progress, ExpectedException, BaseSheet, asyncsingle, asyncthread

Expand Down

0 comments on commit c1fef18

Please sign in to comment.