main.py
from flask import Flask, render_template, send_file, request
from flask_socketio import SocketIO
import requests
from bs4 import BeautifulSoup
from io import BytesIO
from zipfile import ZipFile
from urllib.parse import urlparse, urljoin

app = Flask(__name__)
socketio = SocketIO(app)

# Pages currently being fetched; kept at module level so progress can be
# pushed to connected Socket.IO clients.
currently_crawling_pages = set()
def crawl_web(url, depth=2):
    """Crawl `url` up to `depth` levels deep, staying on the same site,
    and return the downloaded pages as (zip_bytes, website_name)."""
    visited_pages = set()
    parsed = urlparse(url)
    base_url = parsed.scheme + '://' + parsed.hostname
    website_name = parsed.hostname.split('.')[0]  # e.g. 'example' from 'example.com'
    zip_buffer = BytesIO()

    with ZipFile(zip_buffer, 'a') as zip_file:

        def recursive_crawl(current_url, current_depth):
            if (current_url in visited_pages
                    or current_url in currently_crawling_pages
                    or current_depth > depth):
                return
            currently_crawling_pages.add(current_url)
            try:
                # Timeout added so one unresponsive page cannot hang the crawl.
                response = requests.get(current_url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                # Follow only links that resolve to the same site.
                for link in soup.find_all('a'):
                    next_url = link.get('href')
                    if next_url:
                        absolute_url = urljoin(base_url, next_url)
                        if absolute_url.startswith(base_url):
                            recursive_crawl(absolute_url, current_depth + 1)
                # Archive the raw page bytes; fall back to 'index' for URLs
                # ending in a slash, which would otherwise yield an empty name.
                file_name = (current_url.rsplit('/', 1)[-1] or 'index') + '.html'
                zip_file.writestr(file_name, response.content)
            except requests.exceptions.RequestException:
                pass  # Skip pages that fail to download or return an error.
            finally:
                currently_crawling_pages.discard(current_url)
                visited_pages.add(current_url)
                # Push the set of in-progress pages to connected clients.
                socketio.emit('update_pages', list(currently_crawling_pages))

        recursive_crawl(url, 1)

    return zip_buffer.getvalue(), website_name
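
# Illustrative only (not part of the app): how urljoin resolves the hrefs
# found above, using a hypothetical base URL.
#   urljoin('https://example.com', '/docs/page')   -> 'https://example.com/docs/page'
#   urljoin('https://example.com', 'other.html')   -> 'https://example.com/other.html'
#   urljoin('https://example.com', 'https://x.io') -> 'https://x.io'  (rejected by the startswith check)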
@app.route('/')
def index():
    return render_template('index.html')


@app.route('/crawl', methods=['POST'])
def crawl():
    url_to_crawl = request.form['url']
    depth = int(request.form['depth'])
    zip_data, website_name = crawl_web(url_to_crawl, depth)
    return send_file(
        BytesIO(zip_data),
        download_name=f'Th3-{website_name}.zip',
        as_attachment=True,
    )


if __name__ == '__main__':
    socketio.run(app, debug=True)
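
# A minimal sketch of exercising the /crawl endpoint from a shell, assuming
# the development server runs on its default host/port and that
# https://example.com stands in for a real target URL:
#
#   curl -X POST http://127.0.0.1:5000/crawl \
#        -d "url=https://example.com" -d "depth=2" \
#        --output Th3-example.zip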