update. Fixed Google search, which had stopped working. For some reason, more than 100 results can no longer be fetched (the next-page data stopped showing up).
blacknon committed Aug 28, 2023
1 parent 6272191 commit fd14e40
Showing 2 changed files with 20 additions and 14 deletions.
17 changes: 11 additions & 6 deletions pydork/engine_common.py
@@ -10,10 +10,12 @@
* Module providing `CommonEngine`, an inheritable Class called from the SearchEngine Class that holds the processing common to all search engines.
"""

import sys

import requests
import os
import pickle
import time

# selenium driver auto install packages
import chromedriver_autoinstaller
@@ -372,12 +374,9 @@ def create_selenium_driver(self):
pass
self.driver = Firefox(options=options, firefox_profile=profile)

# NOTE:
# To check the User Agent, the following works (for both Chrome and Firefox):
# ```python
# user_agent = self.driver.execute_script("return navigator.userAgent")
# print(user_agent)
# ```
# Set the user agent (read back from the running driver)
user_agent = self.driver.execute_script("return navigator.userAgent")
self.set_user_agent(user_agent)

return
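For reference, the user-agent read-back that replaces the old NOTE also works outside this class; a minimal standalone sketch, assuming a headless Firefox (the driver setup here is illustrative, not part of this commit):

```python
# Minimal sketch: read the browser's real user agent from a live
# Selenium session. The headless Firefox setup is an assumption for
# illustration; pydork builds its driver inside create_selenium_driver
# and stores the value via set_user_agent().
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)
try:
    # Works for both Chrome and Firefox, as the removed NOTE said.
    user_agent = driver.execute_script("return navigator.userAgent")
    print(user_agent)
finally:
    driver.quit()
```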

@@ -578,6 +577,12 @@ def get_result(self, url: str, method='GET', data=None):
if self.USE_SELENIUM:
result = self.request_selenium(url, method=method, data=data)

for i in range(0, 10):
self.driver.execute_script(
"window.scrollTo(0,document.body.scrollHeight)"
)
time.sleep(3)

# Priority 2: access via Splash (used when Selenium is not enabled)
elif self.USE_SPLASH:
# create splash url
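The fixed ten-iteration scroll added above forces Google's lazily loaded results to render before the page source is parsed. A common variant (an assumption here, not what this commit does) is to stop as soon as the page height stops growing:

```python
import time

def scroll_until_stable(driver, pause: float = 3.0, max_rounds: int = 10):
    """Hypothetical helper: scroll to the bottom until
    document.body.scrollHeight stops growing, so content injected by
    lazy loading is present in driver.page_source."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(pause)  # give the new results time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # nothing new appeared; we are at the real bottom
        last_height = new_height
```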
17 changes: 9 additions & 8 deletions pydork/engine_google.py
@@ -10,6 +10,7 @@
* Module containing the search Class for Google.
"""

import sys

import json
import os
@@ -77,11 +78,9 @@ def gen_search_url(self, keyword: str, type: str):
# Set up the search parameters
url_param = {
'q': keyword, # search keyword
'oq': keyword, # search keyword
'num': '100', # number of results shown per page.
'filter': '0', # similar-page filtering (0...disabled, 1...enabled)
'start': '', # start position
'tbs': '', # time period
'nfpr': '1' # disable "did you mean" (escape hatch) suggestions
}
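For context, a parameter dict like this is normally serialized into a query string with `urllib.parse.urlencode`; a minimal sketch under that assumption (the base URL and helper name are illustrative, the real URL is assembled elsewhere in the class):

```python
from urllib import parse

def build_search_url(keyword: str) -> str:
    # Hypothetical helper mirroring the parameters kept by this commit.
    url_param = {
        'q': keyword,    # search keyword
        'oq': keyword,   # search keyword
        'num': '100',    # number of results per page
        'filter': '0',   # disable similar-page filtering
        'nfpr': '1',     # disable "did you mean" suggestions
    }
    return 'https://www.google.com/search?' + parse.urlencode(url_param)

print(build_search_url('site:example.com'))
```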

@@ -209,17 +208,17 @@ def get_links(self, url: str, html: str, type: str):

# When going through Selenium and using Firefox
if self.USE_SELENIUM:
self.SOUP_SELECT_URL = '.yuRUbf > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
self.SOUP_SELECT_URL = '.yuRUbf > div > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > div > a > .LC20lb'
self.SOUP_SELECT_TEXT = '.lEBKkf'
self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'

# When communicating via Splash
elif self.USE_SPLASH:
self.SOUP_SELECT_URL = '.yuRUbf > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
self.SOUP_SELECT_TEXT = '.lEBKkf'
self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'

# TODO: rewrite SEARCH_NEXT_URL
self.get_nextpage_url(html)
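These selector constants are later fed to BeautifulSoup's `select()`; a minimal sketch of that extraction step, assuming `html` holds a rendered result page (the standalone function is illustrative, not the class's actual method):

```python
from bs4 import BeautifulSoup

def extract_results(html: str) -> list:
    # Hypothetical standalone version of the selector logic above; the
    # class keeps these strings in SOUP_SELECT_URL / SOUP_SELECT_TITLE.
    soup = BeautifulSoup(html, 'html.parser')
    results = []
    for a in soup.select('.yuRUbf > div > a'):
        title = a.select_one('.LC20lb')
        if title is not None:
            results.append({'link': a['href'], 'title': title.get_text()})
    return results
```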
@@ -315,7 +314,8 @@ def get_nextpage_url(self, html: str):
# Check the next-url elements with BeautifulSoup
elements = soup.select(self.SOUP_SELECT_NEXT_URL)

# get the next url
print(elements, file=sys.stderr)
# get the list of next urls
elinks = [e['href'] for e in elements]

if len(elinks) == 0:
self.SEARCH_NEXT_URL = next_url

elif len(elinks) > 1:
# DEBUG: something is off here, so check the html and deal with it
next_url = parse.urljoin(
self.ENGINE_TOP_URL, elinks[1]) # type: ignore
self.SEARCH_NEXT_URL = next_url
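The branching on `len(elinks)` exists because Google's pager block can contain both a "previous" and a "next" anchor, and the relative hrefs must be resolved against the engine's top URL. A condensed sketch of that selection (the single-link case is collapsed in the diff above, so `elinks[0]` and the top-URL value are assumptions):

```python
from urllib import parse

ENGINE_TOP_URL = 'https://www.google.com'  # assumed value, for illustration

def pick_next_url(elinks: list):
    """Hypothetical condensed form of get_nextpage_url's branching."""
    if len(elinks) == 0:
        return None  # no pager links found; there is no next page
    if len(elinks) == 1:
        # only one pager link: assume it is the "next" link
        return parse.urljoin(ENGINE_TOP_URL, elinks[0])
    # several pager links: the first is usually "previous", take the second
    return parse.urljoin(ENGINE_TOP_URL, elinks[1])
```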
