Merge pull request #19 from blacknon/develop

Version 1.1.6
blacknon committed on Sep 3, 2023 · commit 733087b
2 parents: 3ee7cd1 + 5c0cd91
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 70 deletions.
diff --git a/pydork/engine.py b/pydork/engine.py
@@ -369,17 +369,17 @@ def search(self, keyword: str, search_type='text', maximum=100):
  self.ENGINE.MESSAGE.print_text(
  url,
  mode='debug',
- separator=": ",
- header=self.ENGINE.MESSAGE.HEADER + ': ' +
+ separator=": ", # type: ignore
+ header=self.ENGINE.MESSAGE.HEADER + ': ' + \
  Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END
  )
 
  # debug
  self.ENGINE.MESSAGE.print_text(
  self.ENGINE.USER_AGENT,
  mode='debug',
- separator=": ",
- header=self.ENGINE.MESSAGE.HEADER + ': ' +
+ separator=": ", # type: ignore
+ header=self.ENGINE.MESSAGE.HEADER + ': ' + \
  Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END
  )
 
@@ -391,8 +391,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
  self.ENGINE.MESSAGE.print_text(
  html,
  mode='debug',
- separator=": ",
- header=self.ENGINE.MESSAGE.HEADER + ': ' +
+ separator=": ", # type: ignore
+ header=self.ENGINE.MESSAGE.HEADER + ': ' + \
  Color.GRAY + '[DEBUG]: [Response]' + Color.END
  )
 
@@ -425,8 +425,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
  # debug
  self.ENGINE.MESSAGE.print_text(
  html,
- mode='debug',
- header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY +
+ mode='debug', # type: ignore
+ header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \
  '[DEBUG]: [ReCaptchaedResponse]' + Color.END,
  separator=": "
  )
@@ -495,7 +495,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
 
  # commandの場合の出力処理
  self.ENGINE.MESSAGE.print_text(
- 'Finally got ' + self.ENGINE.COLOR +
+ # type: ignore
+ 'Finally got ' + self.ENGINE.COLOR + \
  str(len(result)) + Color.END + ' links.',
  header=self.ENGINE.MESSAGE.ENGINE,
  separator=": ",

diff --git a/pydork/engine_common.py b/pydork/engine_common.py
@@ -10,10 +10,10 @@
  * SearchEngine Classから呼び出す、各検索エンジンで共通の処理を保持させる継承用Classである `CommonEngine` を持つモジュール.
 """
 
-
 import requests
 import os
 import pickle
+import time
 
 # selenium driver auto install packages
 import chromedriver_autoinstaller
@@ -372,12 +372,9 @@ def create_selenium_driver(self):
  pass
  self.driver = Firefox(options=options, firefox_profile=profile)
 
- # NOTE:
- # User Agentを確認する場合、↓の処理で実施可能(Chrome/Firefoxともに)。
- # ```python
- # user_agent = self.driver.execute_script("return navigator.userAgent")
- # print(user_agent)
- # ```
+ # User agentを指定させる
+ user_agent = self.driver.execute_script("return navigator.userAgent")
+ self.set_user_agent(user_agent)
 
  return
 
@@ -578,6 +575,12 @@ def get_result(self, url: str, method='GET', data=None):
  if self.USE_SELENIUM:
  result = self.request_selenium(url, method=method, data=data)
 
+ for i in range(0, 10):
+ self.driver.execute_script(
+ "window.scrollTo(0,document.body.scrollHeight)"
+ )
+ time.sleep(3)
+
  # 優先度2: Splash経由でのアクセス(Seleniumが有効になってない場合はこちら)
  elif self.USE_SPLASH:
  # create splash url
@@ -639,17 +642,17 @@ def get_links(self, source_url, html: str, type: str):
  # before processing elists
  self.MESSAGE.print_text(
  ','.join(elinks), # type: ignore
- header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
+ header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \
  '[BeforeProcessing elinks]' + Color.END,
  separator=" :",
  mode="debug",
  )
 
  # before processing etitles
  self.MESSAGE.print_text(
- ','.join(etitles),
- header=self.MESSAGE.HEADER + ': ' +
- Color.BLUE + '[BeforeProcessing etitles]' + Color.END,
+ ','.join(etitles), # type: ignore
+ header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \
+ '[BeforeProcessing etitles]' + Color.END,
  separator=" :",
  mode="debug",
  )
@@ -661,16 +664,16 @@ def get_links(self, source_url, html: str, type: str):
  # after processing elists
  self.MESSAGE.print_text(
  ','.join(elinks), # type: ignore
- header=self.MESSAGE.HEADER + ': ' +
+ header=self.MESSAGE.HEADER + ': ' + \
  Color.GREEN + '[AfterProcessing elinks]' + Color.END,
  separator=" :",
  mode="debug",
  )
 
  # after processing etitles
  self.MESSAGE.print_text(
- ','.join(etitles),
- header=self.MESSAGE.HEADER + ': ' +
+ ','.join(etitles), # type: ignore
+ header=self.MESSAGE.HEADER + ': ' + \
  Color.GREEN + '[AfterProcessing etitles]' + Color.END,
  separator=" :",
  mode="debug",

diff --git a/pydork/engine_google.py b/pydork/engine_google.py
@@ -10,6 +10,7 @@
  * Google用の検索用Classを持つモジュール.
 """
 
+# import sys
 
 import json
 import os
@@ -18,7 +19,7 @@
 from json.decoder import JSONDecodeError
 from urllib import parse
 from lxml import etree
-from bs4 import BeautifulSoup
+# from bs4 import BeautifulSoup
 
 from .common import Color
 from .recaptcha import TwoCaptcha
@@ -77,12 +78,10 @@ def gen_search_url(self, keyword: str, type: str):
  # 検索パラメータの設定
  url_param = {
  'q': keyword, # 検索キーワード
- 'oq': keyword, # 検索キーワード
- 'num': '100', # 1ページごとの表示件数.
- 'filter': '0', # 類似ページのフィルタリング(0...無効, 1...有効)
- 'start': '', # 開始位置
- 'tbs': '', # 期間
- 'nfpr': '1' # もしかして検索(Escape hatch)を無効化
+ 'oq': keyword, # 検索キーワード
+ 'num': 100, # 1ページごとの表示件数.
+ 'filter': 0, # 類似ページのフィルタリング(0...無効, 1...有効)
+ 'nfpr': 1 # もしかして検索(Escape hatch)を無効化
  }
 
  # lang/localeが設定されている場合
@@ -107,17 +106,11 @@ def gen_search_url(self, keyword: str, type: str):
 
  page = 0
  while True:
- if page == 0:
- # parameterにページを開始する番号を指定
- url_param['start'] = str(page * 100)
- params = parse.urlencode(url_param)
-
- target_url = search_url + '?' + params
+ # parameterにページを開始する番号を指定
+ url_param['start'] = str(page * 100)
+ params = parse.urlencode(url_param)
 
- else:
- target_url = self.SEARCH_NEXT_URL
- if self.SEARCH_NEXT_URL is None:
- break
+ target_url = search_url + '?' + params
 
  yield 'GET', target_url, None
  page += 1
@@ -209,20 +202,20 @@ def get_links(self, url: str, html: str, type: str):
 
  # Selenium経由、かつFirefoxを使っている場合
  if self.USE_SELENIUM:
- self.SOUP_SELECT_URL = '.yuRUbf > a'
- self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
+ self.SOUP_SELECT_URL = '.yuRUbf > div > a'
+ self.SOUP_SELECT_TITLE = '.yuRUbf > div > a > .LC20lb'
  self.SOUP_SELECT_TEXT = '.lEBKkf'
- self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
+ self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
 
  # Splash経由で通信している場合
  elif self.USE_SPLASH:
  self.SOUP_SELECT_URL = '.yuRUbf > a'
  self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
  self.SOUP_SELECT_TEXT = '.lEBKkf'
- self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
+ self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'
 
  # TODO: SEARCH_NEXT_URLを書き換える
- self.get_nextpage_url(html)
+ # self.get_nextpage_url(html)
 
  # CommonEngineの処理を呼び出す
  links = super().get_links(url, html, type)
@@ -308,29 +301,6 @@ def get_suggest_list(self, suggests: list, char: str, html: str):
 
  return suggests
 
- def get_nextpage_url(self, html: str):
- # BeautifulSoupでの解析を実施
- soup = BeautifulSoup(html, 'lxml')
-
- # BeautifulSoupでnext urlの要素を確認する
- elements = soup.select(self.SOUP_SELECT_NEXT_URL)
-
- # next urlを取得する
- elinks = [e['href'] for e in elements]
-
- if len(elinks) == 0:
- self.SEARCH_NEXT_URL = None
-
- elif len(elinks) == 1:
- next_url = parse.urljoin(
- self.ENGINE_TOP_URL, elinks[0]) # type: ignore
- self.SEARCH_NEXT_URL = next_url
-
- elif len(elinks) > 1:
- next_url = parse.urljoin(
- self.ENGINE_TOP_URL, elinks[1]) # type: ignore
- self.SEARCH_NEXT_URL = next_url
-
  def processings_elist(self, elinks, etitles, etexts: list):
  """processings_elist
 

diff --git a/setup.py b/setup.py
@@ -82,8 +82,8 @@ def get_completefile_install_location(shell):
 
 
 name = 'pydork'
-version = '1.1.5'
-release = '1.1.5'
+version = '1.1.6'
+release = '1.1.6'
 
 if __name__ == "__main__":
  setuptools.setup(