From fd14e408ef108c8c22943439e0f1491debb8d6cd Mon Sep 17 00:00:00 2001 From: blacknon Date: Mon, 28 Aug 2023 23:39:43 +0900 Subject: [PATCH 1/4] =?UTF-8?q?update.=20Google=E6=A4=9C=E7=B4=A2=E5=8B=95?= =?UTF-8?q?=E3=81=8B=E3=81=AA=E3=81=8F=E3=81=AA=E3=81=A3=E3=81=A6=E3=81=9F?= =?UTF-8?q?=E3=81=AE=E3=82=92=E5=AF=BE=E5=BF=9C.=20=E3=81=AA=E3=81=9C?= =?UTF-8?q?=E3=81=8B100=E4=BB=B6=E4=BB=A5=E4=B8=8A=E5=8F=96=E3=82=8C?= =?UTF-8?q?=E3=81=AA=E3=81=8F=E3=81=AA=E3=81=A3=E3=81=9F(=E6=AC=A1?= =?UTF-8?q?=E3=83=9A=E3=83=BC=E3=82=B8=E3=81=AE=E3=83=87=E3=83=BC=E3=82=BF?= =?UTF-8?q?=E5=87=BA=E3=81=A6=E3=81=93=E3=81=AA=E3=81=8F=E3=81=AA=E3=81=A3?= =?UTF-8?q?=E3=81=9F)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pydork/engine_common.py | 17 +++++++++++------ pydork/engine_google.py | 17 +++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/pydork/engine_common.py b/pydork/engine_common.py index a8eac76..7b649df 100644 --- a/pydork/engine_common.py +++ b/pydork/engine_common.py @@ -10,10 +10,12 @@ * SearchEngine Classから呼び出す、各検索エンジンで共通の処理を保持させる継承用Classである `CommonEngine` を持つモジュール. """ +import sys import requests import os import pickle +import time # selenium driver auto install packages import chromedriver_autoinstaller @@ -372,12 +374,9 @@ def create_selenium_driver(self): pass self.driver = Firefox(options=options, firefox_profile=profile) - # NOTE: - # User Agentを確認する場合、↓の処理で実施可能(Chrome/Firefoxともに)。 - # ```python - # user_agent = self.driver.execute_script("return navigator.userAgent") - # print(user_agent) - # ``` + # User agentを指定させる + user_agent = self.driver.execute_script("return navigator.userAgent") + self.set_user_agent(user_agent) return @@ -578,6 +577,12 @@ def get_result(self, url: str, method='GET', data=None): if self.USE_SELENIUM: result = self.request_selenium(url, method=method, data=data) + for i in range(0, 10): + self.driver.execute_script( + "window.scrollTo(0,document.body.scrollHeight)" + ) + time.sleep(3) + # 優先度2: Splash経由でのアクセス(Seleniumが有効になってない場合はこちら) elif self.USE_SPLASH: # create splash url diff --git a/pydork/engine_google.py b/pydork/engine_google.py index d064f17..9e5e501 100644 --- a/pydork/engine_google.py +++ b/pydork/engine_google.py @@ -10,6 +10,7 @@ * Google用の検索用Classを持つモジュール. """ +import sys import json import os @@ -77,11 +78,9 @@ def gen_search_url(self, keyword: str, type: str): # 検索パラメータの設定 url_param = { 'q': keyword, # 検索キーワード - 'oq': keyword, # 検索キーワード + 'oq': keyword, # 検索キーワード 'num': '100', # 1ページごとの表示件数. 'filter': '0', # 類似ページのフィルタリング(0...無効, 1...有効) - 'start': '', # 開始位置 - 'tbs': '', # 期間 'nfpr': '1' # もしかして検索(Escape hatch)を無効化 } @@ -209,17 +208,17 @@ def get_links(self, url: str, html: str, type: str): # Selenium経由、かつFirefoxを使っている場合 if self.USE_SELENIUM: - self.SOUP_SELECT_URL = '.yuRUbf > a' - self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb' + self.SOUP_SELECT_URL = '.yuRUbf > div > a' + self.SOUP_SELECT_TITLE = '.yuRUbf > div > a > .LC20lb' self.SOUP_SELECT_TEXT = '.lEBKkf' - self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a' + self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a' # Splash経由で通信している場合 elif self.USE_SPLASH: self.SOUP_SELECT_URL = '.yuRUbf > a' self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb' self.SOUP_SELECT_TEXT = '.lEBKkf' - self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a' + self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a' # TODO: SEARCH_NEXT_URLを書き換える self.get_nextpage_url(html) @@ -315,7 +314,8 @@ def get_nextpage_url(self, html: str): # BeautifulSoupでnext urlの要素を確認する elements = soup.select(self.SOUP_SELECT_NEXT_URL) - # next urlを取得する + print(elements, file=sys.stderr) + # next urlのリストを取得する elinks = [e['href'] for e in elements] if len(elinks) == 0: @@ -327,6 +327,7 @@ def get_nextpage_url(self, html: str): self.SEARCH_NEXT_URL = next_url elif len(elinks) > 1: + # DEBUG: なんかおかしいのでhtml確認して対応 next_url = parse.urljoin( self.ENGINE_TOP_URL, elinks[1]) # type: ignore self.SEARCH_NEXT_URL = next_url From 60bc95b4a50adcac50ac4a94d2f63a79c092649c Mon Sep 17 00:00:00 2001 From: blacknon Date: Mon, 28 Aug 2023 23:54:40 +0900 Subject: [PATCH 2/4] =?UTF-8?q?update.=20=E7=84=A1=E7=90=86=E3=82=84?= =?UTF-8?q?=E3=82=8A=E3=83=9A=E3=83=BC=E3=82=B8=E3=83=B3=E3=82=B0=E3=81=95?= =?UTF-8?q?=E3=81=9B=E3=82=8B=E6=96=B9=E5=BC=8F=E3=81=A7=E5=AE=9F=E8=A3=85?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pydork/engine_google.py | 51 ++++++++--------------------------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/pydork/engine_google.py b/pydork/engine_google.py index 9e5e501..a4d12b4 100644 --- a/pydork/engine_google.py +++ b/pydork/engine_google.py @@ -10,7 +10,7 @@ * Google用の検索用Classを持つモジュール. """ -import sys +# import sys import json import os @@ -19,7 +19,7 @@ from json.decoder import JSONDecodeError from urllib import parse from lxml import etree -from bs4 import BeautifulSoup +# from bs4 import BeautifulSoup from .common import Color from .recaptcha import TwoCaptcha @@ -79,9 +79,9 @@ def gen_search_url(self, keyword: str, type: str): url_param = { 'q': keyword, # 検索キーワード 'oq': keyword, # 検索キーワード - 'num': '100', # 1ページごとの表示件数. - 'filter': '0', # 類似ページのフィルタリング(0...無効, 1...有効) - 'nfpr': '1' # もしかして検索(Escape hatch)を無効化 + 'num': 100, # 1ページごとの表示件数. + 'filter': 0, # 類似ページのフィルタリング(0...無効, 1...有効) + 'nfpr': 1 # もしかして検索(Escape hatch)を無効化 } # lang/localeが設定されている場合 @@ -106,17 +106,11 @@ def gen_search_url(self, keyword: str, type: str): page = 0 while True: - if page == 0: - # parameterにページを開始する番号を指定 - url_param['start'] = str(page * 100) - params = parse.urlencode(url_param) - - target_url = search_url + '?' + params + # parameterにページを開始する番号を指定 + url_param['start'] = str(page * 100) + params = parse.urlencode(url_param) - else: - target_url = self.SEARCH_NEXT_URL - if self.SEARCH_NEXT_URL is None: - break + target_url = search_url + '?' + params yield 'GET', target_url, None page += 1 @@ -221,7 +215,7 @@ def get_links(self, url: str, html: str, type: str): self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a' # TODO: SEARCH_NEXT_URLを書き換える - self.get_nextpage_url(html) + # self.get_nextpage_url(html) # CommonEngineの処理を呼び出す links = super().get_links(url, html, type) @@ -307,31 +301,6 @@ def get_suggest_list(self, suggests: list, char: str, html: str): return suggests - def get_nextpage_url(self, html: str): - # BeautifulSoupでの解析を実施 - soup = BeautifulSoup(html, 'lxml') - - # BeautifulSoupでnext urlの要素を確認する - elements = soup.select(self.SOUP_SELECT_NEXT_URL) - - print(elements, file=sys.stderr) - # next urlのリストを取得する - elinks = [e['href'] for e in elements] - - if len(elinks) == 0: - self.SEARCH_NEXT_URL = None - - elif len(elinks) == 1: - next_url = parse.urljoin( - self.ENGINE_TOP_URL, elinks[0]) # type: ignore - self.SEARCH_NEXT_URL = next_url - - elif len(elinks) > 1: - # DEBUG: なんかおかしいのでhtml確認して対応 - next_url = parse.urljoin( - self.ENGINE_TOP_URL, elinks[1]) # type: ignore - self.SEARCH_NEXT_URL = next_url - def processings_elist(self, elinks, etitles, etexts: list): """processings_elist From ad9a74903d8f1e77346304cc67d68d5667bad65f Mon Sep 17 00:00:00 2001 From: blacknon Date: Tue, 29 Aug 2023 10:23:08 +0900 Subject: [PATCH 3/4] =?UTF-8?q?update.=20warning=E5=AF=BE=E5=BF=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pydork/engine.py | 19 ++++++++++--------- pydork/engine_common.py | 16 +++++++--------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/pydork/engine.py b/pydork/engine.py index 635adbf..a65fbf5 100644 --- a/pydork/engine.py +++ b/pydork/engine.py @@ -369,8 +369,8 @@ def search(self, keyword: str, search_type='text', maximum=100): self.ENGINE.MESSAGE.print_text( url, mode='debug', - separator=": ", - header=self.ENGINE.MESSAGE.HEADER + ': ' + + separator=": ", # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + \ Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END ) @@ -378,8 +378,8 @@ def search(self, keyword: str, search_type='text', maximum=100): self.ENGINE.MESSAGE.print_text( self.ENGINE.USER_AGENT, mode='debug', - separator=": ", - header=self.ENGINE.MESSAGE.HEADER + ': ' + + separator=": ", # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + \ Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END ) @@ -391,8 +391,8 @@ def search(self, keyword: str, search_type='text', maximum=100): self.ENGINE.MESSAGE.print_text( html, mode='debug', - separator=": ", - header=self.ENGINE.MESSAGE.HEADER + ': ' + + separator=": ", # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + \ Color.GRAY + '[DEBUG]: [Response]' + Color.END ) @@ -425,8 +425,8 @@ def search(self, keyword: str, search_type='text', maximum=100): # debug self.ENGINE.MESSAGE.print_text( html, - mode='debug', - header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + + mode='debug', # type: ignore + header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \ '[DEBUG]: [ReCaptchaedResponse]' + Color.END, separator=": " ) @@ -495,7 +495,8 @@ def search(self, keyword: str, search_type='text', maximum=100): # commandの場合の出力処理 self.ENGINE.MESSAGE.print_text( - 'Finally got ' + self.ENGINE.COLOR + + # type: ignore + 'Finally got ' + self.ENGINE.COLOR + \ str(len(result)) + Color.END + ' links.', header=self.ENGINE.MESSAGE.ENGINE, separator=": ", diff --git a/pydork/engine_common.py b/pydork/engine_common.py index 7b649df..e3712cc 100644 --- a/pydork/engine_common.py +++ b/pydork/engine_common.py @@ -10,8 +10,6 @@ * SearchEngine Classから呼び出す、各検索エンジンで共通の処理を保持させる継承用Classである `CommonEngine` を持つモジュール. """ -import sys - import requests import os import pickle @@ -644,7 +642,7 @@ def get_links(self, source_url, html: str, type: str): # before processing elists self.MESSAGE.print_text( ','.join(elinks), # type: ignore - header=self.MESSAGE.HEADER + ': ' + Color.BLUE + + header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \ '[BeforeProcessing elinks]' + Color.END, separator=" :", mode="debug", @@ -652,9 +650,9 @@ def get_links(self, source_url, html: str, type: str): # before processing etitles self.MESSAGE.print_text( - ','.join(etitles), - header=self.MESSAGE.HEADER + ': ' + - Color.BLUE + '[BeforeProcessing etitles]' + Color.END, + ','.join(etitles), # type: ignore + header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \ + '[BeforeProcessing etitles]' + Color.END, separator=" :", mode="debug", ) @@ -666,7 +664,7 @@ def get_links(self, source_url, html: str, type: str): # after processing elists self.MESSAGE.print_text( ','.join(elinks), # type: ignore - header=self.MESSAGE.HEADER + ': ' + + header=self.MESSAGE.HEADER + ': ' + \ Color.GREEN + '[AfterProcessing elinks]' + Color.END, separator=" :", mode="debug", @@ -674,8 +672,8 @@ def get_links(self, source_url, html: str, type: str): # after processing etitles self.MESSAGE.print_text( - ','.join(etitles), - header=self.MESSAGE.HEADER + ': ' + + ','.join(etitles), # type: ignore + header=self.MESSAGE.HEADER + ': ' + \ Color.GREEN + '[AfterProcessing etitles]' + Color.END, separator=" :", mode="debug", From 5c0cd91317ae8ee71c0742d69e8aa846bb7f2e92 Mon Sep 17 00:00:00 2001 From: blacknon Date: Sun, 3 Sep 2023 23:29:04 +0900 Subject: [PATCH 4/4] update. v1.1.6 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 59c15c8..b43ae43 100755 --- a/setup.py +++ b/setup.py @@ -82,8 +82,8 @@ def get_completefile_install_location(shell): name = 'pydork' -version = '1.1.5' -release = '1.1.5' +version = '1.1.6' +release = '1.1.6' if __name__ == "__main__": setuptools.setup(