
Commit c1fb0cf

Fixed sitemap.xml parsing (Fixes #107)

s0md3v committed Jan 25, 2019
1 parent c019a04
Showing 1 changed file with 4 additions and 4 deletions.

photon.py
@@ -341,7 +341,7 @@ def xmlParser(response):
     return findall(r'<loc>(.*?)</loc>', response)
 
 
-def zap(url):
+def zap(inputUrl):
     """Extract links from robots.txt and sitemap.xml."""
     if args.archive:
         from plugins.wayback import time_machine
@@ -356,7 +356,7 @@ def zap(url):
             verb('Internal page', url)
             internal.add(url)
     # Makes request to robots.txt
-    response = requests.get(url + '/robots.txt', verify=False).text
+    response = requests.get(inputUrl + '/robots.txt', verify=False).text
     # Making sure robots.txt isn't some fancy 404 page
     if '<body' not in response:
         # If you know it, you know it
@@ -376,15 +376,15 @@ def zap(url):
                     robots.add(url)
             print('%s URLs retrieved from robots.txt: %s' % (good, len(robots)))
     # Makes request to sitemap.xml
-    response = requests.get(url + '/sitemap.xml', verify=False).text
+    response = requests.get(inputUrl + '/sitemap.xml', verify=False).text
     # Making sure robots.txt isn't some fancy 404 page
     if '<body' not in response:
         matches = xmlParser(response)
         if matches: # if there are any matches
             print('%s URLs retrieved from sitemap.xml: %s' % (
                 good, len(matches)))
             for match in matches:
-                verb('Internal page', url)
+                verb('Internal page', match)
                 # Cleaning up the URL and adding it to the internal list for
                 # crawling
                 internal.add(match)
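Why the rename fixes sitemap parsing: the context lines above show that zap() reuses the name url both as a loop variable and as scratch inside the robots.txt loop (internal.add(url), robots.add(url)), so by the time the sitemap.xml request ran, url no longer pointed at the address the caller passed in. Below is a minimal, self-contained sketch of that shadowing pattern; time_machine here is a hypothetical stand-in for Photon's wayback plugin, not the real implementation.

# Sketch of the shadowing bug the rename avoids (hypothetical helpers).
def time_machine():
    # Stand-in for plugins.wayback.time_machine: returns archived URLs.
    return ['https://web.archive.org/a', 'https://web.archive.org/b']

def zap_buggy(url):
    internal = set()
    for url in time_machine():  # rebinds the parameter `url`
        internal.add(url)
    # `url` is now the last archived URL, not the caller's target.
    return url + '/sitemap.xml'

def zap_fixed(inputUrl):
    internal = set()
    for url in time_machine():  # loop variable no longer shadows the parameter
        internal.add(url)
    return inputUrl + '/sitemap.xml'

print(zap_buggy('https://example.com'))  # https://web.archive.org/b/sitemap.xml
print(zap_fixed('https://example.com'))  # https://example.com/sitemap.xml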

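For reference, the xmlParser() shown in the first hunk extracts URLs from a sitemap with a non-greedy regex over <loc> tags. A self-contained sketch of the same approach; the sample sitemap string is made up for illustration.

from re import findall

def xml_parser(response):
    # Same pattern as photon.py: non-greedy match between <loc> tags.
    return findall(r'<loc>(.*?)</loc>', response)

sample = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>'''

print(xml_parser(sample))  # ['https://example.com/', 'https://example.com/about']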