Merge pull request #1 from moebiuszed/css_selector

Css selector

Jaime-alv committed Sep 20, 2021
2 parents cc32bd1 + 07c2483 commit d3d9e53
Showing 5 changed files with 83 additions and 45 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
 # personal url list
 storage/
+test_list.txt

 # pycharm files
 .idea/
63 changes: 43 additions & 20 deletions main.py
@@ -1,6 +1,7 @@
 #! python3
 # Copyright 2021 Jaime Álvarez Fernández
 import pathlib
+import bs4
 import requests
 import webbrowser
 import filecmp
@@ -16,50 +17,72 @@ def main():
     if len(passed_argument) > 1:
         logging.debug(f'argument from sys {passed_argument}')
         for n in range(1, len(passed_argument)):
-            add_url.main(passed_argument[n], 'storage')
+            add_url.main(passed_argument[n], None, 'storage')
     try:
         with pathlib.Path('storage\\url_list.txt').open('r') as file:
             list_of_saved_url = json.load(file)
+        for each_url in list_of_saved_url:
+            file_name = list_of_saved_url[each_url]['file_name']
+            css_selector = list_of_saved_url[each_url]['css_selector']
+            logging.debug(f'url = {each_url}')
+            logging.debug(f'file_name = {file_name}')
+            logging.debug(f'selector = {css_selector}')
+            compare_url(each_url, file_name, css_selector)
     except FileNotFoundError:
         logging.error('Running setup.py')
         setup.setup()
-    for each_url in list_of_saved_url['url']:
-        file_name = list_of_saved_url['url'][each_url]
-        compare_url(each_url, file_name)


 # compare to a saved version
-def compare_url(url, file_name):
+def compare_url(url, file_name, css_selector):
     new_url = requests.get(url)
-    path = f'storage\\url_data\\{file_name}'
-    temp_file = pathlib.Path('storage\\temp.txt').open('wb')
-    for chunk in new_url.iter_content(10000):
-        temp_file.write(chunk)
-    temp_file.close()
+    path = f'storage\\url_data\\{file_name}.txt'
+    if css_selector is not None:
+        temp_file = pathlib.Path('storage\\temp.txt').open('w', encoding='utf-8')
+        bs4_object = bs4.BeautifulSoup(new_url.text, features="html.parser")
+        parsed_element = bs4_object.select(css_selector)
+        temp_file.write(str(parsed_element[0].get_text()))
+        temp_file.close()
+    elif css_selector is None:
+        temp_file = pathlib.Path('storage\\temp.txt').open('wb')
+        for chunk in new_url.iter_content(10000):
+            temp_file.write(chunk)
+        temp_file.close()
     compare_files = filecmp.cmp('storage\\temp.txt', path, shallow=False)
     if compare_files:
-        temp_file.close()
-        logging.critical(f"Equal to stored one")
+        logging.warning(f"{url} Equal to stored one")
     elif not compare_files:
         logging.critical(f'Opening {url}. Differences found.')
         webbrowser.open(url)
-        save_url(url, path)
+        save_url(url, path, css_selector)


 # update the saved version
-def save_url(url, path):
-    open_url = pathlib.Path(path).open('wb')
-    new_content_for_url = requests.get(url)
-    for chunk in new_content_for_url.iter_content(10000):
-        open_url.write(chunk)
-    open_url.close()
+def save_url(url, path, css_selector):
+    logging.warning(f'Updating file with {url} in {path}')
+    if css_selector is not None:
+        new_url = requests.get(url)
+        open_old_url = pathlib.Path(path).open('w', encoding='utf-8')
+        bs4_object = bs4.BeautifulSoup(new_url.text, features="html.parser")
+        parsed_element = bs4_object.select(css_selector)
+        open_old_url.write(str(parsed_element[0].get_text()))
+        open_old_url.close()
+
+    elif css_selector is None:
+        open_url = pathlib.Path(path).open('wb')
+        new_content_for_url = requests.get(url)
+        for chunk in new_content_for_url.iter_content(10000):
+            open_url.write(chunk)
+        open_url.close()


 if __name__ == "__main__":
     try:
         logging.basicConfig(filename='storage\\logging\\log.txt', level=logging.DEBUG,
                             format='%(levelname)s - %(message)s')
+        pathlib.Path('storage\\logging\\log.txt').open('w')
     except FileNotFoundError:
         setup.setup()
+        pathlib.Path('storage\\logging\\log.txt').open('w')
     logging.debug(pathlib.Path.cwd())
     logging.debug('main function')
     main()
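
Note (not part of the diff): a minimal sketch of what the new css_selector branch in compare_url() boils down to, with a hypothetical URL, selector, and stored file standing in for entries from url_list.txt.

import bs4
import requests

url = 'https://example.com'  # hypothetical watched page
css_selector = 'h1'          # hypothetical selector saved alongside the url

response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, features="html.parser")
matches = soup.select(css_selector)    # select() returns a list of matching elements
new_text = str(matches[0].get_text())  # only the first match is kept

with open('stored_copy.txt', 'r', encoding='utf-8') as f:  # stand-in for storage\url_data\<name>.txt
    old_text = f.read()

if new_text != old_text:  # same idea as the filecmp.cmp(temp, stored, shallow=False) check
    print(f'Opening {url}. Differences found.')
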
53 changes: 33 additions & 20 deletions modules/add_url.py
@@ -5,42 +5,50 @@
 import requests
 import json
 import logging
+import bs4


-# TODO: check if given url is valid or not
-# TODO: check if given url is already in json file
-# TODO: add url and path file to json
-# TODO: return to main.py
 def domain_name(url):
-    name = re.compile(r'(http(s)?://)?(www\.)?(?P<domain>.*)(\.(es|com))(/((?P<header>(.*))[/.:]))?')
+    name = re.compile(r'(http(s)?://)?(www\.)?(?P<domain>.*)\.(([a-zA-Z]+)(/((?P<header>(.*))[/.:]))?)')
     seek_name = name.search(url)
     return seek_name.group('domain'), seek_name.group('header')


-# json = {'url': {'https://www.correos.com' : 'correos' }}
+# json = {'url': {'name' : 'http://'}}


-def main(url, root):
+def main(url, css_selector, root):
     logging.critical(f'passed url: {url}')
-    try:
+    try:  # check if given url is valid or not
         requests.get(url).raise_for_status()
         with pathlib.Path(f'{root}\\url_list.txt').open('r') as f:
             list_of_saved_url = json.load(f)
-        if list_of_saved_url['url'].get(url, None) is None:
+        # check if given url is already in json file
+        if list_of_saved_url.get(url, None) is None:
             response = requests.get(url)
             domain, header = domain_name(url)

             if header is None:
                 name = domain
             else:
                 name = domain + '_' + header

             logging.warning(f'New file with name {name}.txt')
-            list_of_saved_url['url'].setdefault(url, name + '.txt')
-            with pathlib.Path(f'{root}\\url_list.txt').open('w') as f:
-                json.dump(list_of_saved_url, f)
-            save_to = pathlib.Path(f'{root}\\url_data\\{name}.txt').open('wb')
-            for chunk in response.iter_content(10000):
-                save_to.write(chunk)
+            additional_info = {}
+            list_of_saved_url.setdefault(url, additional_info)
+            list_of_saved_url[url].setdefault('file_name', name)
+
+            if css_selector is not None:
+                new_file = pathlib.Path(f'{root}\\url_data\\{name}.txt').open('w', encoding='utf-8')
+                list_of_saved_url[url].setdefault('css_selector', css_selector)
+                with pathlib.Path(f'{root}\\url_list.txt').open('w') as f:
+                    json.dump(list_of_saved_url, f)
+                bs4_object = bs4.BeautifulSoup(response.text, features="html.parser")
+                parsed_element = bs4_object.select(css_selector)
+                new_file.write(str(parsed_element[0].get_text()))
+
+            elif css_selector is None:
+                new_file = pathlib.Path(f'{root}\\url_data\\{name}.txt').open('wb')
+                list_of_saved_url[url].setdefault('css_selector', None)
+                for chunk in response.iter_content(10000):
+                    new_file.write(chunk)
             logging.debug(f'Stored url in json file {list_of_saved_url}')
     except:
         logging.error(f"Something went wrong with {url}")
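
Aside (not part of the commit): domain_name() previously only accepted .es and .com endings; the new pattern takes any alphabetic TLD. A quick sketch of what it extracts, using a made-up URL:

import re

# Same pattern as the new line above; 'domain' and 'header' feed the stored file name.
name = re.compile(r'(http(s)?://)?(www\.)?(?P<domain>.*)\.(([a-zA-Z]+)(/((?P<header>(.*))[/.:]))?)')
seek_name = name.search('https://www.example.org/news/')
print(seek_name.group('domain'), seek_name.group('header'))  # example news
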
@@ -51,6 +59,11 @@ def main(url, root):
     logging.basicConfig(filename='..\\storage\\logging\\log.txt', level=logging.DEBUG,
                         format='%(levelname)s - %(message)s')
     # add url manually
-    print('Add desired url\nurl needs to start with http:// or https://\n')
+    print(
+        'Add desired url, followed by a whitespace, followed by the unique css selector.\nurl needs to start with http:// or https://\n')
     answer_url = input('@: ')
-    main(answer_url, '..\\storage')
+    clean_answer = answer_url.split(' ', maxsplit=1)
+    if len(clean_answer) == 2:
+        main(clean_answer[0], clean_answer[1], '..\\storage')
+    elif len(clean_answer) == 1:
+        main(answer_url, None, '..\\storage')
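
Note (not part of the diff): how the new prompt input is read. split(' ', maxsplit=1) cuts at the first whitespace only, so everything after the URL, spaces included, becomes the css selector. The URL and selector below are made up.

answer_url = 'https://example.com/news div.headline a'  # hypothetical '@: ' input
clean_answer = answer_url.split(' ', maxsplit=1)
print(clean_answer)  # ['https://example.com/news', 'div.headline a']

# Without a selector, the list has one element and css_selector is passed as None:
print('https://example.com/news'.split(' ', maxsplit=1))  # ['https://example.com/news']
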
7 changes: 3 additions & 4 deletions modules/setup.py
@@ -10,19 +10,18 @@ def setup():
         logging.basicConfig(filename='..\\storage\\logging\\log.txt', level=logging.DEBUG,
                             format='%(levelname)s - %(message)s')
     except FileNotFoundError:
-        logging.warning('No logging directory')
         pathlib.Path('..\\storage\\logging').mkdir(parents=True, exist_ok=True)
         logging.basicConfig(filename='storage\\logging\\log.txt', level=logging.DEBUG,
                             format='%(levelname)s - %(message)s')
-        logging.debug('directory created')
+        logging.warning('Log directory created')

     if not pathlib.Path('..\\storage\\url_data').exists():
-        logging.warning('No directory found')
+        logging.error('No directory found')
         pathlib.Path('..\\storage\\url_data').mkdir(parents=True, exist_ok=True)
         logging.debug('directory created')

     if not pathlib.Path('..\\storage\\url_list.txt').exists():
-        logging.warning('No url_list.txt')
+        logging.error('No url_list.txt')
         pathlib.Path('..\\storage\\url_list.txt').open('w')
         json_url_dict = {}
         with pathlib.Path('..\\storage\\url_list.txt').open('w') as f:
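
Aside (not part of the commit): the missing-file messages move up Python's severity ladder, so they still stand out when the log level is raised above WARNING.

import logging

# DEBUG (10) < WARNING (30) < ERROR (40) < CRITICAL (50)
print(logging.DEBUG < logging.WARNING < logging.ERROR < logging.CRITICAL)  # True
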
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1 +1,3 @@
-requests
+requests~=2.26.0
+
+beautifulsoup4~=4.10.0

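Note (not part of the diff): ~= is pip's compatible-release operator, so requests~=2.26.0 allows any requests 2.26.x and beautifulsoup4~=4.10.0 any 4.10.x. Both install with:

pip install -r requirements.txt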