From 60bc95b4a50adcac50ac4a94d2f63a79c092649c Mon Sep 17 00:00:00 2001
From: blacknon
Date: Mon, 28 Aug 2023 23:54:40 +0900
Subject: [PATCH] update. Implemented using a forced-paging approach.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pydork/engine_google.py | 51 ++++++++---------------------------------
 1 file changed, 10 insertions(+), 41 deletions(-)

diff --git a/pydork/engine_google.py b/pydork/engine_google.py
index 9e5e501..a4d12b4 100644
--- a/pydork/engine_google.py
+++ b/pydork/engine_google.py
@@ -10,7 +10,7 @@
     * Module containing the search class for Google.
 """
 
-import sys
+# import sys
 import json
 import os
 
@@ -19,7 +19,7 @@
 from json.decoder import JSONDecodeError
 from urllib import parse
 from lxml import etree
-from bs4 import BeautifulSoup
+# from bs4 import BeautifulSoup
 
 from .common import Color
 from .recaptcha import TwoCaptcha
@@ -79,9 +79,9 @@ def gen_search_url(self, keyword: str, type: str):
         url_param = {
             'q': keyword,  # search keyword
             'oq': keyword,  # search keyword
-            'num': '100',  # number of results shown per page.
-            'filter': '0',  # similar-page filtering (0...disabled, 1...enabled)
-            'nfpr': '1'  # disable the "did you mean" (escape hatch) suggestions
+            'num': 100,  # number of results shown per page.
+            'filter': 0,  # similar-page filtering (0...disabled, 1...enabled)
+            'nfpr': 1  # disable the "did you mean" (escape hatch) suggestions
         }
 
         # if lang/locale is set
@@ -106,17 +106,11 @@ def gen_search_url(self, keyword: str, type: str):
         page = 0
         while True:
-            if page == 0:
-                # set the page start offset in the parameters
-                url_param['start'] = str(page * 100)
-                params = parse.urlencode(url_param)
-
-                target_url = search_url + '?' + params
+            # set the page start offset in the parameters
+            url_param['start'] = str(page * 100)
+            params = parse.urlencode(url_param)
 
-            else:
-                target_url = self.SEARCH_NEXT_URL
-                if self.SEARCH_NEXT_URL is None:
-                    break
+            target_url = search_url + '?' + params
 
             yield 'GET', target_url, None
             page += 1
@@ -221,7 +215,7 @@ def get_links(self, url: str, html: str, type: str):
             self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
 
             # TODO: rewrite SEARCH_NEXT_URL
-            self.get_nextpage_url(html)
+            # self.get_nextpage_url(html)
 
         # call CommonEngine's processing
         links = super().get_links(url, html, type)
@@ -307,31 +301,6 @@ def get_suggest_list(self, suggests: list, char: str, html: str):
 
         return suggests
 
-    def get_nextpage_url(self, html: str):
-        # parse with BeautifulSoup
-        soup = BeautifulSoup(html, 'lxml')
-
-        # check the next-url elements with BeautifulSoup
-        elements = soup.select(self.SOUP_SELECT_NEXT_URL)
-
-        print(elements, file=sys.stderr)
-        # get the list of next urls
-        elinks = [e['href'] for e in elements]
-
-        if len(elinks) == 0:
-            self.SEARCH_NEXT_URL = None
-
-        elif len(elinks) == 1:
-            next_url = parse.urljoin(
-                self.ENGINE_TOP_URL, elinks[0])  # type: ignore
-            self.SEARCH_NEXT_URL = next_url
-
-        elif len(elinks) > 1:
-            # DEBUG: something is off here; check the html and handle it
-            next_url = parse.urljoin(
-                self.ENGINE_TOP_URL, elinks[1])  # type: ignore
-            self.SEARCH_NEXT_URL = next_url
-
     def processings_elist(self, elinks, etitles, etexts: list):
         """processings_elist
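
For illustration, the paging logic introduced above boils down to a small standalone generator: instead of scraping a "next" link out of each result page (the removed get_nextpage_url / SEARCH_NEXT_URL flow), every page URL is computed up front from a start offset (page * 100, matching num=100). A minimal sketch of that forced-paging approach follows; the gen_search_url name, the url_param keys, and the ('GET', url, None) tuple shape come from the patch, but passing search_url in as a plain argument (rather than reading it from engine state) is an assumption made here to keep the sketch self-contained.

    from itertools import islice
    from urllib import parse


    def gen_search_url(search_url: str, keyword: str):
        # Query parameters as in the patch; parse.urlencode() stringifies the ints.
        url_param = {
            'q': keyword,    # search keyword
            'oq': keyword,   # search keyword
            'num': 100,      # number of results shown per page
            'filter': 0,     # similar-page filtering (0...disabled, 1...enabled)
            'nfpr': 1,       # disable the "did you mean" (escape hatch) suggestions
        }

        page = 0
        while True:
            # Forced paging: derive each page's offset directly instead of
            # following a next-page link scraped from the previous response.
            url_param['start'] = str(page * 100)
            target_url = search_url + '?' + parse.urlencode(url_param)
            yield 'GET', target_url, None
            page += 1


    # Usage: take the first three page requests (the generator is infinite).
    for method, url, body in islice(
            gen_search_url('https://www.google.com/search', 'site:example.com'), 3):
        print(method, url)

One design consequence visible in the diff: with the "if self.SEARCH_NEXT_URL is None: break" branch removed, the generator no longer terminates on its own, so the caller becomes responsible for stopping iteration (for example, once a fetched page yields no links).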