# wb_selenium.py - Selenium scraper for Wildberries catalog and product pages.
from time import sleep

import selenium.webdriver.support.expected_conditions as ec
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# import json
# import fake_useragent
# Catalog listing URL; the path suggests the men's jumpers-and-cardigans
# category. Only the commented-out crawl call in the ``__main__`` section
# references it.
URL = 'https://www.wildberries.ru/catalog/muzhchinam/odezhda/dzhempery-i-kardigany'
class MySelenium:
    """Selenium-driven scraper for Wildberries catalog and product pages.

    ``set_count_pages_and_run`` / ``parse_pages`` collect product links into
    ``self.db`` as ``{index: [url]}``; ``page_item_info`` then visits every
    stored link and extracts the product details.
    """

    # Category title; overwritten per instance by ``set_count_pages_and_run``.
    name_category = ''
    # Cards-per-page figure used to turn the item counter into a page count.
    ITEMS_PER_PAGE = 100
    # Seconds to wait for a page's key element to become visible.
    WAIT_TIMEOUT = 5
    # Upper bound on scroll steps so a never-displayed element cannot hang
    # the scraper in an endless loop.
    MAX_SCROLL_STEPS = 2000

    def __init__(self, headless=True):
        """Start a Chrome session with the configured options.

        :param headless: run Chrome without a visible window.
        """
        # Per-instance storage: a class-level dict would be shared (and
        # mutated) by every scraper instance.
        self.db = {}

        chrome_options = Options()
        chrome_options.page_load_strategy = 'normal'
        chrome_options.add_argument("--disable-extensions")
        if headless:
            # The ``Options.headless`` setter is deprecated in Selenium 4.8+
            # and gone in later releases; the command-line flag always works.
            chrome_options.add_argument("--headless")
            # A headless browser has no screen to maximize to, so request a
            # desktop-sized viewport explicitly.
            chrome_options.add_argument("--window-size=1920,1080")

        # Options only take effect when passed to the driver at construction.
        self.driver = Chrome(options=chrome_options)
        if not headless:
            self.driver.maximize_window()
        self.driver.implicitly_wait(5)

    def _scroll_until_displayed(self, obj, step, max_steps=None):
        """Scroll vertically by ``step`` px until ``obj.is_displayed()``.

        :param obj: element-like object exposing ``is_displayed()``.
        :param step: pixels per scroll step (negative scrolls up).
        :param max_steps: step budget; defaults to ``MAX_SCROLL_STEPS``.
        :return: True if the element is displayed, False if the budget ran out.
        """
        if max_steps is None:
            max_steps = self.MAX_SCROLL_STEPS
        script = f"window.scrollBy(0,{step})"
        for _ in range(max_steps):
            if obj.is_displayed():
                return True
            self.driver.execute_script(script, "")
        return obj.is_displayed()

    def scroll_up(self, obj, max_steps=None):
        """Scroll up in 50px steps until ``obj`` is displayed.

        :return: True if ``obj`` ended up displayed, False otherwise.
        """
        return self._scroll_until_displayed(obj, -50, max_steps)

    def scroll_down(self, obj, max_steps=None):
        """Scroll down in 50px steps until ``obj`` is displayed.

        :return: True if ``obj`` ended up displayed, False otherwise.
        """
        return self._scroll_until_displayed(obj, 50, max_steps)

    # collect all product links from the specified number of pages
    def parse_pages(self, url, count_page=0):
        """Collect product links from up to ``count_page`` catalog pages.

        Links are written to ``self.db`` as ``{index: [href]}`` with indices
        starting at 1 on every call.

        :param url: URL of the first catalog page to visit.
        :param count_page: maximum number of pages to visit; 0 visits none.
        """
        indx = 1
        for page_number in range(1, count_page + 1):
            self.driver.get(url)
            WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                ec.visibility_of_element_located(
                    (By.CSS_SELECTOR, "div.catalog-title-wrap>span>span")))

            footer = self.driver.find_element(By.CLASS_NAME, 'footer__copyrights')
            # Step-wise scroll toward the footer before reading the cards
            # (presumably so lazily rendered cards are present - confirm).
            self.scroll_down(footer)

            cards = self.driver.find_elements(
                By.XPATH, "//div[@class='product-card__wrapper']/a")
            for card in cards:
                try:
                    link = card.get_attribute('href')
                except WebDriverException as e:
                    # Best effort: one unreadable card must not stop the crawl.
                    print(e)
                    continue
                if not link:
                    continue
                self.db[indx] = [link]
                indx += 1

            # No need to look for a next page after the last requested one.
            if page_number == count_page:
                break

            # pagination: find_elements returns [] instead of raising when
            # the link is absent, so the last page is detected cleanly.
            next_links = self.driver.find_elements(By.CSS_SELECTOR, 'a.j-next-page')
            next_url = next_links[0].get_attribute('href') if next_links else None
            if not next_url:
                print("This is the last page. Data collection completed")
                break
            url = next_url

    @staticmethod
    def _parse_item_count(text):
        """Extract the integer item count from the counter text.

        Keeps decimal digits only, so any kind of thousands separator
        (regular or non-breaking space) is tolerated.

        :raises ValueError: if ``text`` contains no digits.
        """
        digits = ''.join(ch for ch in text if ch.isdecimal())
        if not digits:
            raise ValueError(f"no item count found in {text!r}")
        return int(digits)

    @classmethod
    def _page_count(cls, items):
        """Return how many pages are needed for ``items`` products.

        Uses ceiling division: a partially filled last page still counts.
        """
        return -(-items // cls.ITEMS_PER_PAGE)

    # set number of pages / start crawling pages
    def set_count_pages_and_run(self, url, count_page=0):
        """Read the category header, then crawl its pages for product links.

        Stores the category title in ``self.name_category``.

        :param url: category page URL.
        :param count_page: number of pages to crawl; 0 means all pages, as
            derived from the item counter shown on the page.
        """
        self.driver.get(url)
        WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
            ec.visibility_of_element_located(
                (By.CSS_SELECTOR, "div.catalog-title-wrap>span>span")))
        self.name_category = self.driver.find_element(
            By.CSS_SELECTOR, "div.catalog-title-wrap>h1"
        ).text.strip()
        counter_text = self.driver.find_element(
            By.CSS_SELECTOR, "div.catalog-title-wrap>span>span").text
        count_all_items = self._parse_item_count(counter_text)

        if count_page == 0:
            # parse all product links by category
            self.parse_pages(url=url, count_page=self._page_count(count_all_items))
        else:
            # parse links from the specified number of pages
            self.parse_pages(url=url, count_page=count_page)

    def _scrape_item(self, url):
        """Open one product page and return its details as a dict."""
        driver = self.driver
        driver.get(url)
        WebDriverWait(driver, self.WAIT_TIMEOUT).until(
            ec.visibility_of_element_located(
                (By.CSS_SELECTOR, 'div.details-section__inner-wrap')))

        # block - name, brand, article
        header = driver.find_element(
            By.CSS_SELECTOR, 'div.product-page__header-wrap>div:nth-child(1)')
        brand = header.find_element(By.TAG_NAME, 'span').text.strip()
        name = header.find_element(By.TAG_NAME, 'h1').text.strip()
        article = driver.find_element(By.CSS_SELECTOR, 'p.product-article').text.strip()

        # block - size, color, price
        size_list = [
            label.text.replace('\n', ' ')
            for label in driver.find_elements(By.XPATH, "//label[@class='j-size']")
        ]
        # get_attribute returns None for a missing attribute; treat as empty.
        color_list = [
            (img.get_attribute('title') or '').strip()
            for img in driver.find_elements(By.CSS_SELECTOR, "li.j-color>a>img")
        ]
        price_now = driver.find_elements(
            By.CSS_SELECTOR, 'div.price-block__content>p>span')[1].text

        # block - about the product (left side)
        buttons_unwrap = driver.find_elements(
            By.CSS_SELECTOR, 'div.collapsible__toggle-wrap>button')
        self._scroll_until_displayed(buttons_unwrap[1], 25)
        # unwrap characteristic and description
        buttons_unwrap[1].click()
        buttons_unwrap[0].click()

        try:
            certified_quality = driver.find_element(
                By.CSS_SELECTOR, 'p.certificate-check__text').text
        except NoSuchElementException:
            # The certificate paragraph is optional on a product page.
            certified_quality = "missing"

        additional_info = {}
        for tr in driver.find_elements(
                By.CSS_SELECTOR, '.j-add-info-section>div>table>tbody>tr'):
            th = tr.find_element(By.CSS_SELECTOR, 'th.product-params__cell').text.strip()
            td = tr.find_element(By.CSS_SELECTOR, 'td.product-params__cell').text.strip()
            additional_info[th] = td

        # block - about the product (right side)
        sections = driver.find_elements(By.CSS_SELECTOR, 'section.details-section__details')
        description = {
            sections[0].find_element(By.CSS_SELECTOR, 'h3.details__header').text.strip():
                sections[0].find_element(By.CSS_SELECTOR, 'p.collapsable__text').text.strip()}
        compound = {
            sections[1].find_element(By.CSS_SELECTOR, 'h3.details__header').text.strip():
                sections[1].find_element(By.CSS_SELECTOR, 'div.j-consist').text.strip()}

        return {
            'brand': brand,
            'name': name,
            'article': article,
            'size': size_list,
            'color': color_list,
            'price': price_now,
            'certified_quality': certified_quality,
            'additional_info': additional_info,
            'description': description,
            'compound': compound,
        }

    # collection of all information from the list of links
    def page_item_info(self):
        """
        Collects product information for every link stored in ``self.db``:
        brand, name, article, size, color, price, certified_quality,
        additional information, description, compound.

        Each item's fields are printed one per line. An item whose page
        cannot be scraped is reported and skipped, so a single bad page
        does not abort the whole run.

        :return: dict(id: item_info), where item_info is a dict of the
            fields listed above
        """
        collected = {}
        for item_id, record in self.db.items():
            try:
                info = self._scrape_item(record[0])
            except (WebDriverException, IndexError) as e:
                print(f"Skipping item {item_id}: {e}")
                continue
            print(*info.values(), sep='\n')
            collected[item_id] = info
        return collected
if __name__ == '__main__':
    # Manual smoke run: scrape a single hard-coded product page and print
    # its details.
    HOST = "https://www.wildberries.ru/"
    # FAKE_USER = fake_useragent.FakeUserAgent().random
    # HEADERS = {'User-Agent': FAKE_USER}
    s = MySelenium()
    try:
        # s.set_count_pages_and_run(URL, 3)
        item_link = 'https://www.wildberries.ru/catalog/141012690/detail.aspx'
        s.db[1] = [item_link, ]
        s.page_item_info()
        sleep(1)
    finally:
        # Always close the browser, even when scraping raises, so no Chrome
        # process is left running.
        s.driver.quit()