
Commit

moved to headless chrome
segment-srl committed Nov 14, 2018
1 parent dcc0078 commit 4e98aa1
Showing 21 changed files with 3,154 additions and 34 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,5 +1,8 @@
*.py[cod]
tmp.py
.idea
*.db
tmp
node_modules/
.*
!/.gitignore
package-lock.json
12 changes: 6 additions & 6 deletions README.md
@@ -3,30 +3,30 @@
Htcap is a web application scanner able to crawl single page applications (SPAs) recursively by intercepting ajax calls and DOM changes.
Htcap is not just another vulnerability scanner: it focuses mainly on the crawling process and uses external tools to discover vulnerabilities. It's designed to be a tool for both manual and automated penetration testing of modern web applications.

This is the very first release that uses headless Chrome instead of PhantomJS.
Htcap's JavaScript crawling engine has been rewritten to take advantage of the new async/await features of ECMAScript and has been converted to a Node.js module built on top of [Puppeteer](https://github.com/GoogleChrome/puppeteer).

More info at [htcap.org](http://htcap.org).

## SETUP

### Requirements

1. Python 2.7
2. PhantomJS v2
2. Node.js and npm
3. Sqlmap (for sqlmap scanner module)
4. Arachni (for arachni scanner module)

### Download and Run

```console
$ git clone https://github.com/segment-srl/htcap.git htcap
$ git clone https://github.com/fcavallarin/htcap.git htcap
$ htcap/htcap.py
```

PhantomJs can be downloaded [here](http://phantomjs.org//download.html). It comes as a self-contained executable with all libraries linked statically, so there is no need to install or compile anything else.


## DOCUMENTATION

Documentation, examples and demos can be found at the official website [http://htcap.org](http://htcap.org).
Documentation, examples and demos can be found at the official website [https://htcap.org](https://htcap.org).

## LICENSE

43 changes: 32 additions & 11 deletions core/crawl/crawler.py
@@ -59,7 +59,7 @@ def __init__(self, argv):
self.request_patterns = []

self.defaults = {
"useragent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
"useragent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3582.0 Safari/537.36',
"num_threads": 10,
"max_redirects": 10,
"out_file_overwrite": False,
@@ -75,7 +75,9 @@
"max_post_depth": 10,
"override_timeout_functions": True,
'crawl_forms': True, # only if mode == CRAWLMODE_AGGRESSIVE
'deduplicate_pages': True
'deduplicate_pages': True,
'use_legacy_browser': False,
'headless_chrome': True
}


@@ -123,6 +125,8 @@ def usage(self):
" -O dont't override timeout functions (setTimeout, setInterval)\n"
" -K keep elements in the DOM (prevent removal)\n"
" -e disable hEuristic page deduplication\n"
" -L use Legacy browser (phantomjs) instead of chrome\n"
" -l do not run chrome in headless mode\n"
)


@@ -373,10 +377,11 @@ def main(self, argv):
Shared.th_condition = threading.Condition()
Shared.main_condition = threading.Condition()


probe_cmd = get_phantomjs_cmd()
if not probe_cmd:
print "Error: unable to find phantomjs executable"
deps_errors = check_dependences(self.base_dir)
if len(deps_errors) > 0:
print "Dependences errors: "
for err in deps_errors:
print " %s" % err
sys.exit(1)

start_cookies = []
@@ -398,7 +403,7 @@ def main(self, argv):
user_script = None

try:
opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:e')
opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:eLl')
except getopt.GetoptError as err:
print str(err)
sys.exit(1)
@@ -491,6 +496,15 @@ def main(self, argv):
sys.exit(1)
elif o == "-e":
Shared.options['deduplicate_pages'] = False
elif o == "-L":
Shared.options['use_legacy_browser'] = True
elif o == "-l":
Shared.options['headless_chrome'] = False

probe_cmd = get_phantomjs_cmd() if Shared.options['use_legacy_browser'] else get_node_cmd()
if not probe_cmd: # maybe useless
print "Error: unable to find node (or phantomjs) executable"
sys.exit(1)


if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
@@ -508,11 +522,18 @@
if Shared.options['mode'] == CRAWLMODE_PASSIVE:
probe_options.append("-t") # dont trigger events

if Shared.options['proxy']:
probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))

probe_cmd.append(self.base_dir + 'probe/analyze.js')
if Shared.options['use_legacy_browser']:
if Shared.options['proxy']:
probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
probe_cmd.append(self.base_dir + 'probe/analyze.js')
else:
if Shared.options['proxy']:
probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'], Shared.options['proxy']['host'], Shared.options['proxy']['port'])])
if not Shared.options['headless_chrome']:
probe_options.append("-l")
probe_cmd.append(self.base_dir + 'probe/chrome-probe/analyze.js')


if len(Shared.excluded_urls) > 0:
13 changes: 7 additions & 6 deletions core/crawl/crawler_thread.py
@@ -98,7 +98,7 @@ def load_probe_json(self, jsn):
try:
return json.loads(jsn)
except Exception:
#print "-- JSON DECODE ERROR %s" % jsn
print "-- JSON DECODE ERROR %s" % jsn
raise


@@ -146,15 +146,16 @@ def send_probe(self, request, errors):

# print cmd_to_str(Shared.probe_cmd + params)
# print ""

cmd = CommandExecutor(Shared.probe_cmd + params, True)
jsn, err = cmd.execute(Shared.options['process_timeout'] + 2)

cmd = CommandExecutor(Shared.probe_cmd + params)
jsn = cmd.execute(Shared.options['process_timeout'] + 2)
if err:
print err
jsn = None

if jsn == None:
errors.append(ERROR_PROBEKILLED)
# time.sleep(self.process_retries_interval) # ... ???
# retries -= 1
# continue
break


4 changes: 2 additions & 2 deletions core/crawl/lib/utils.py
@@ -97,8 +97,8 @@ def request_is_crawlable(request):
return False

types = [REQTYPE_LINK, REQTYPE_REDIRECT]
if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
types.append(REQTYPE_FORM)
# if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
# types.append(REQTYPE_FORM)

return request.type in types and re.match("^https?://", request.url, re.I)

167 changes: 167 additions & 0 deletions core/crawl/probe/chrome-probe/analyze.js
@@ -0,0 +1,167 @@
/*
HTCAP - 1.2
http://htcap.org
Author: [email protected]
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
*/


"use strict";

const htcap = require("./htcap");
const utils = require('./utils');
const process = require('process');


var sleep = function(n){
return new Promise(resolve => {
setTimeout(resolve, n);
});
};



var argv = utils.parseArgs(process.argv, "hVaftUJdICc:MSEp:Tsx:A:r:mHX:PD:R:Oi:u:vy:l", {});
var options = argv.opts

var targetUrl = argv.args[0];



if(!targetUrl){
utils.usage();
process.exit(-1);
}

targetUrl = targetUrl.trim();
if(targetUrl.length < 4 || targetUrl.substring(0,4).toLowerCase() != "http"){
targetUrl = "http://" + targetUrl;
}



htcap.launch(targetUrl, options).then( crawler => {
const page = crawler.page();
var execTO = null;

console.log("[");

function exit(){
clearTimeout(execTO);
crawler.browser().close();
}

crawler.on("redirect", async function(e, crawler){
// console.log(crawler.redirect());
// console.log(e.params.url);
// utils.printCookies(crawler);
// utils.printRequest({type:'link',method:"GET",url:e.params.url});
// exit();
});
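// once the initial DOM is loaded, harvest links and forms from the rendered page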


crawler.on("domcontentloaded", async function(e, crawler){
//utils.printCookies(crawler);
await utils.printLinks("html", crawler.page())
await utils.printForms("html", crawler.page())

//await sleep(4000)
});

crawler.on("start", function(e, crawler){
//console.log("--->Start");
})


crawler.on("newdom", async function(e, crawler){
//console.log(e.params)
})

crawler.on("xhr", async function(e, crawler){
utils.printRequest(e.params.request)

//return false
});

crawler.on("xhrCompleted", function(e, crawler){
//console.log("XHR completed")
});


crawler.on("jsonp", function(e, crawler){
utils.printRequest(e.params.request)
});

crawler.on("jsonpCompleted", function(e, crawler){

});

crawler.on("websocket", function(e, crawler){
utils.printRequest(e.params.request)
});

crawler.on("websocketMessage", function(e, crawler){

});

crawler.on("websocketSend", function(e, crawler){

});

crawler.on("formSubmit", function(e, crawler){
utils.printRequest(e.params.request)
});

crawler.on("navigation", function(e, crawler){
e.params.request.type="link";
utils.printRequest(e.params.request)
});

crawler.on("eventtriggered", function(e, crawler){
//console.log(e.params)
});

crawler.on("triggerevent", function(e, crawler){
//console.log(e.params)
});

crawler.on("earlydetach", function(e, crawler){
//console.log('["warning","earlydetach of element ' + e.params.node + '],')
//crawler.browser().close();
});


async function end(){
if(!crawler.redirect()){
const el = await crawler.page().$("html");
const v = await el.getProperty('innerText');
const hash = await v.jsonValue();
var json = '["page_hash",' + JSON.stringify(hash) + '],';
console.log(json);

if(options.returnHtml){
json = '["html",' + JSON.stringify(hash) + '],';
console.log(json);
}
}

utils.printStatus(crawler);
exit();
}

crawler.on("end", end);

execTO = setTimeout(function(){ // (very dirty solution)
crawler.on("end", function(){});
crawler.errors().push(["probe_timeout", "maximum execution time reached"]);
end();
}, options.maxExecTime);


crawler.start()

})
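For context only (not part of this commit): the chrome-probe drives headless Chrome through Puppeteer's page and network events. Below is a minimal sketch of that underlying pattern; the target URL and launch options are illustrative, and the real probe adds DOM instrumentation, event triggering and the JSON output protocol shown above.

```javascript
const puppeteer = require("puppeteer");

(async () => {
    // headless by default; the new -l flag effectively flips this to headless: false
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // report every request the page issues, roughly what the probe emits as crawl results
    page.on("request", req => {
        console.log(JSON.stringify({ method: req.method(), url: req.url(), type: req.resourceType() }));
    });

    await page.goto("http://example.com", { waitUntil: "networkidle2" });
    await browser.close();
})();
```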
5 changes: 5 additions & 0 deletions core/crawl/probe/chrome-probe/ckdeps.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
try{
require("puppeteer")
} catch(e) {
console.log("puppeteer")
}
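ckdeps.js establishes a simple convention: try to require each Node dependency and print the name of anything missing, one per line, presumably so the Python launcher (the new check_dependences() call in crawler.py) can collect the names from stdout and report them. A sketch of the same pattern generalized to a list of modules follows; the list and the argv handling are illustrative, not part of the commit.

```javascript
// same try/require/report pattern, extended to several modules
// (only "puppeteer" is checked by the committed ckdeps.js)
const modules = ["puppeteer"].concat(process.argv.slice(2));
modules.forEach(function(name){
    try {
        require(name);
    } catch (e) {
        // print the missing module name so the caller can collect it from stdout
        console.log(name);
    }
});
```

With no Puppeteer installed, running `node ckdeps.js` prints `puppeteer`, which the crawler can then surface as a dependency error before the crawl starts.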