diff --git a/pydork/engine.py b/pydork/engine.py index 635adbf..a65fbf5 100644 --- a/pydork/engine.py +++ b/pydork/engine.py @@ -369,8 +369,8 @@ def search(self, keyword: str, search_type='text', maximum=100): self.ENGINE.MESSAGE.print_text( url, mode='debug', - separator=": ", - header=self.ENGINE.MESSAGE.HEADER + ': ' + + separator=": ", # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + \ Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END ) @@ -378,8 +378,8 @@ def search(self, keyword: str, search_type='text', maximum=100): self.ENGINE.MESSAGE.print_text( self.ENGINE.USER_AGENT, mode='debug', - separator=": ", - header=self.ENGINE.MESSAGE.HEADER + ': ' + + separator=": ", # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + \ Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END ) @@ -391,8 +391,8 @@ def search(self, keyword: str, search_type='text', maximum=100): self.ENGINE.MESSAGE.print_text( html, mode='debug', - separator=": ", - header=self.ENGINE.MESSAGE.HEADER + ': ' + + separator=": ", # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + \ Color.GRAY + '[DEBUG]: [Response]' + Color.END ) @@ -425,8 +425,8 @@ def search(self, keyword: str, search_type='text', maximum=100): # debug self.ENGINE.MESSAGE.print_text( html, - mode='debug', - header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + + mode='debug', # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \ '[DEBUG]: [ReCaptchaedResponse]' + Color.END, separator=": " ) @@ -495,7 +495,8 @@ def search(self, keyword: str, search_type='text', maximum=100): # commandの場合の出力処理 self.ENGINE.MESSAGE.print_text( - 'Finally got ' + self.ENGINE.COLOR + + # type: ignore + 'Finally got ' + self.ENGINE.COLOR + \ str(len(result)) + Color.END + ' links.', header=self.ENGINE.MESSAGE.ENGINE, separator=": ", diff --git a/pydork/engine_common.py b/pydork/engine_common.py index a8eac76..e3712cc 100644 --- a/pydork/engine_common.py +++ b/pydork/engine_common.py @@ -10,10 +10,10 @@ * SearchEngine Classから呼び出す、各検索エンジンで共通の処理を保持させる継承用Classである `CommonEngine` を持つモジュール. """ - import requests import os import pickle +import time # selenium driver auto install packages import chromedriver_autoinstaller @@ -372,12 +372,9 @@ def create_selenium_driver(self): pass self.driver = Firefox(options=options, firefox_profile=profile) - # NOTE: - # User Agentを確認する場合、↓の処理で実施可能(Chrome/Firefoxともに)。 - # ```python - # user_agent = self.driver.execute_script("return navigator.userAgent") - # print(user_agent) - # ``` + # User agentを指定させる + user_agent = self.driver.execute_script("return navigator.userAgent") + self.set_user_agent(user_agent) return @@ -578,6 +575,12 @@ def get_result(self, url: str, method='GET', data=None): if self.USE_SELENIUM: result = self.request_selenium(url, method=method, data=data) + for i in range(0, 10): + self.driver.execute_script( + "window.scrollTo(0,document.body.scrollHeight)" + ) + time.sleep(3) + # 優先度2: Splash経由でのアクセス(Seleniumが有効になってない場合はこちら) elif self.USE_SPLASH: # create splash url @@ -639,7 +642,7 @@ def get_links(self, source_url, html: str, type: str): # before processing elists self.MESSAGE.print_text( ','.join(elinks), # type: ignore - header=self.MESSAGE.HEADER + ': ' + Color.BLUE + + header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \ '[BeforeProcessing elinks]' + Color.END, separator=" :", mode="debug", @@ -647,9 +650,9 @@ def get_links(self, source_url, html: str, type: str): # before processing etitles self.MESSAGE.print_text( - ','.join(etitles), - header=self.MESSAGE.HEADER + ': ' + - Color.BLUE + '[BeforeProcessing etitles]' + Color.END, + ','.join(etitles), # type: ignore + header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \ + '[BeforeProcessing etitles]' + Color.END, separator=" :", mode="debug", ) @@ -661,7 +664,7 @@ def get_links(self, source_url, html: str, type: str): # after processing elists self.MESSAGE.print_text( ','.join(elinks), # type: ignore - header=self.MESSAGE.HEADER + ': ' + + header=self.MESSAGE.HEADER + ': ' + \ Color.GREEN + '[AfterProcessing elinks]' + Color.END, separator=" :", mode="debug", @@ -669,8 +672,8 @@ def get_links(self, source_url, html: str, type: str): # after processing etitles self.MESSAGE.print_text( - ','.join(etitles), - header=self.MESSAGE.HEADER + ': ' + + ','.join(etitles), # type: ignore + header=self.MESSAGE.HEADER + ': ' + \ Color.GREEN + '[AfterProcessing etitles]' + Color.END, separator=" :", mode="debug", diff --git a/pydork/engine_google.py b/pydork/engine_google.py index d064f17..a4d12b4 100644 --- a/pydork/engine_google.py +++ b/pydork/engine_google.py @@ -10,6 +10,7 @@ * Google用の検索用Classを持つモジュール. """ +# import sys import json import os @@ -18,7 +19,7 @@ from json.decoder import JSONDecodeError from urllib import parse from lxml import etree -from bs4 import BeautifulSoup +# from bs4 import BeautifulSoup from .common import Color from .recaptcha import TwoCaptcha @@ -77,12 +78,10 @@ def gen_search_url(self, keyword: str, type: str): # 検索パラメータの設定 url_param = { 'q': keyword, # 検索キーワード - 'oq': keyword, # 検索キーワード - 'num': '100', # 1ページごとの表示件数. - 'filter': '0', # 類似ページのフィルタリング(0...無効, 1...有効) - 'start': '', # 開始位置 - 'tbs': '', # 期間 - 'nfpr': '1' # もしかして検索(Escape hatch)を無効化 + 'oq': keyword, # 検索キーワード + 'num': 100, # 1ページごとの表示件数. + 'filter': 0, # 類似ページのフィルタリング(0...無効, 1...有効) + 'nfpr': 1 # もしかして検索(Escape hatch)を無効化 } # lang/localeが設定されている場合 @@ -107,17 +106,11 @@ def gen_search_url(self, keyword: str, type: str): page = 0 while True: - if page == 0: - # parameterにページを開始する番号を指定 - url_param['start'] = str(page * 100) - params = parse.urlencode(url_param) - - target_url = search_url + '?' + params + # parameterにページを開始する番号を指定 + url_param['start'] = str(page * 100) + params = parse.urlencode(url_param) - else: - target_url = self.SEARCH_NEXT_URL - if self.SEARCH_NEXT_URL is None: - break + target_url = search_url + '?' + params yield 'GET', target_url, None page += 1 @@ -209,20 +202,20 @@ def get_links(self, url: str, html: str, type: str): # Selenium経由、かつFirefoxを使っている場合 if self.USE_SELENIUM: - self.SOUP_SELECT_URL = '.yuRUbf > a' - self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb' + self.SOUP_SELECT_URL = '.yuRUbf > div > a' + self.SOUP_SELECT_TITLE = '.yuRUbf > div > a > .LC20lb' self.SOUP_SELECT_TEXT = '.lEBKkf' - self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a' + self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a' # Splash経由で通信している場合 elif self.USE_SPLASH: self.SOUP_SELECT_URL = '.yuRUbf > a' self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb' self.SOUP_SELECT_TEXT = '.lEBKkf' - self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a' + self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a' # TODO: SEARCH_NEXT_URLを書き換える - self.get_nextpage_url(html) + # self.get_nextpage_url(html) # CommonEngineの処理を呼び出す links = super().get_links(url, html, type) @@ -308,29 +301,6 @@ def get_suggest_list(self, suggests: list, char: str, html: str): return suggests - def get_nextpage_url(self, html: str): - # BeautifulSoupでの解析を実施 - soup = BeautifulSoup(html, 'lxml') - - # BeautifulSoupでnext urlの要素を確認する - elements = soup.select(self.SOUP_SELECT_NEXT_URL) - - # next urlを取得する - elinks = [e['href'] for e in elements] - - if len(elinks) == 0: - self.SEARCH_NEXT_URL = None - - elif len(elinks) == 1: - next_url = parse.urljoin( - self.ENGINE_TOP_URL, elinks[0]) # type: ignore - self.SEARCH_NEXT_URL = next_url - - elif len(elinks) > 1: - next_url = parse.urljoin( - self.ENGINE_TOP_URL, elinks[1]) # type: ignore - self.SEARCH_NEXT_URL = next_url - def processings_elist(self, elinks, etitles, etexts: list): """processings_elist diff --git a/setup.py b/setup.py index 59c15c8..b43ae43 100755 --- a/setup.py +++ b/setup.py @@ -82,8 +82,8 @@ def get_completefile_install_location(shell): name = 'pydork' -version = '1.1.5' -release = '1.1.5' +version = '1.1.6' +release = '1.1.6' if __name__ == "__main__": setuptools.setup(