Merge pull request #1924 from midichef/http_parse

[http] fix parsing link header
saulpw · Jun 20, 2023 · c1fef18 · c1fef18
2 parents fb4352c + 3a8a29a
commit c1fef18
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 5 deletions.
diff --git a/visidata/loaders/http.py b/visidata/loaders/http.py
@@ -1,3 +1,5 @@
+import re
+
 from visidata import Path, RepeatFile, vd, VisiData
 from visidata.loaders.tsv import splitter
 
@@ -47,24 +49,59 @@ def _iter_lines(path=path, response=response, max_next=vd.options.http_max_next)
  linkhdr = response.getheader('Link')
  src = None
  if linkhdr:
- links = urllib.parse.parse_header(linkhdr)
- src = links.get('next', {}).get('url', None)
+ links = parse_header_links(linkhdr)
+ link_data = {}
+ for link in links:
+ key = link.get('rel') or link.get('url')
+ link_data[key] = link
+ src = link_data.get('next', {}).get('url', None)
 
  if not src:
  break
 
  n += 1
  if n > max_next:
- vd.warning(f'stopping at max {max_next} pages')
+ vd.warning(f'stopping at max next pages: {max_next} pages')
  break
 
  vd.status(f'fetching next page from {src}')
- response = requests.get(src, stream=True, **vd.options.getall('http_req_'))
+ req = urllib.request.Request(src, **vd.options.getall('http_req_'))
+ response = urllib.request.urlopen(req)
 
  # add resettable iterator over contents as an already-open fp
  path.fptext = RepeatFile(_iter_lines())
 
  return vd.openSource(path, filetype=filetype)
 
def parse_header_links(link_header):
    '''Parse an HTTP Link header string into a list of dictionaries.

    Takes a link header string, of the form
    '<https://example.com/content?page=1>; rel="prev", <https://example.com/content?page=3>; rel="next"'
    and returns one dict per link, holding the target under 'url' plus one
    entry per "key=value" parameter:
    [{'url': 'https://example.com/content?page=1', 'rel': 'prev'},
     {'url': 'https://example.com/content?page=3', 'rel': 'next'}]

    An empty, blank or None header yields an empty list.
    Parameters without a value (no '=') are skipped; the parameters that
    follow them are still parsed.
    See https://datatracker.ietf.org/doc/html/rfc8288#section-3

    NOTE: this is a lightweight splitter, not a full RFC 8288 parser; a ';'
    inside the <url> or inside a quoted parameter value is still treated as
    a separator.
    '''
    quote_space = ' \'"'
    # Tolerate a missing header (None) the same way as an empty one.
    link_header = (link_header or '').strip(quote_space)
    if not link_header:
        return []

    links = []
    # Links are separated by a comma followed by the next '<'; splitting on
    # that pair keeps commas inside a URL's query string intact.
    for link_value in re.split(', *<', link_header):
        # Everything before the first ';' is the target, the rest is params.
        url, _, params = link_value.partition(';')
        link = {'url': url.strip('<>' + quote_space)}

        for param in params.split(';'):
            # Split on the first '=' only, so values that themselves contain
            # '=' (e.g. title="a=b") are kept whole instead of raising.
            key, sep, value = param.partition('=')
            if not sep:
                # Valueless or empty parameter: skip it, but keep parsing so
                # a later rel="next" is not lost.
                continue
            link[key.strip(quote_space)] = value.strip(quote_space)
        links.append(link)
    return links
 
 VisiData.openurl_https = VisiData.openurl_http
diff --git a/visidata/plugins.py b/visidata/plugins.py
@@ -5,7 +5,7 @@
 import shutil
 import importlib
 import subprocess
-import urllib
+import urllib.error
 
 from visidata import VisiData, vd, Path, CellColorizer, JsonLinesSheet, AttrDict, Column, Progress, ExpectedException, BaseSheet, asyncsingle, asyncthread