Skip to content

Commit

Permalink
Merge pull request #19 from blacknon/develop
Browse files Browse the repository at this point in the history
Version 1.1.6
  • Loading branch information
blacknon committed Sep 3, 2023
2 parents 3ee7cd1 + 5c0cd91 commit 733087b
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 70 deletions.
19 changes: 10 additions & 9 deletions pydork/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,17 +369,17 @@ def search(self, keyword: str, search_type='text', maximum=100):
self.ENGINE.MESSAGE.print_text(
url,
mode='debug',
separator=": ",
header=self.ENGINE.MESSAGE.HEADER + ': ' +
separator=": ", # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + \
Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END
)

# debug
self.ENGINE.MESSAGE.print_text(
self.ENGINE.USER_AGENT,
mode='debug',
separator=": ",
header=self.ENGINE.MESSAGE.HEADER + ': ' +
separator=": ", # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + \
Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END
)

Expand All @@ -391,8 +391,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
self.ENGINE.MESSAGE.print_text(
html,
mode='debug',
separator=": ",
header=self.ENGINE.MESSAGE.HEADER + ': ' +
separator=": ", # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + \
Color.GRAY + '[DEBUG]: [Response]' + Color.END
)

Expand Down Expand Up @@ -425,8 +425,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
# debug
self.ENGINE.MESSAGE.print_text(
html,
mode='debug',
header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY +
mode='debug', # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \
'[DEBUG]: [ReCaptchaedResponse]' + Color.END,
separator=": "
)
Expand Down Expand Up @@ -495,7 +495,8 @@ def search(self, keyword: str, search_type='text', maximum=100):

# commandの場合の出力処理
self.ENGINE.MESSAGE.print_text(
'Finally got ' + self.ENGINE.COLOR +
# type: ignore
'Finally got ' + self.ENGINE.COLOR + \
str(len(result)) + Color.END + ' links.',
header=self.ENGINE.MESSAGE.ENGINE,
separator=": ",
Expand Down
31 changes: 17 additions & 14 deletions pydork/engine_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
* SearchEngine Classから呼び出す、各検索エンジンで共通の処理を保持させる継承用Classである `CommonEngine` を持つモジュール.
"""


import requests
import os
import pickle
import time

# selenium driver auto install packages
import chromedriver_autoinstaller
Expand Down Expand Up @@ -372,12 +372,9 @@ def create_selenium_driver(self):
pass
self.driver = Firefox(options=options, firefox_profile=profile)

# NOTE:
# User Agentを確認する場合、↓の処理で実施可能(Chrome/Firefoxともに)。
# ```python
# user_agent = self.driver.execute_script("return navigator.userAgent")
# print(user_agent)
# ```
# User agentを指定させる
user_agent = self.driver.execute_script("return navigator.userAgent")
self.set_user_agent(user_agent)

return

Expand Down Expand Up @@ -578,6 +575,12 @@ def get_result(self, url: str, method='GET', data=None):
if self.USE_SELENIUM:
result = self.request_selenium(url, method=method, data=data)

for i in range(0, 10):
self.driver.execute_script(
"window.scrollTo(0,document.body.scrollHeight)"
)
time.sleep(3)

# 優先度2: Splash経由でのアクセス(Seleniumが有効になってない場合はこちら)
elif self.USE_SPLASH:
# create splash url
Expand Down Expand Up @@ -639,17 +642,17 @@ def get_links(self, source_url, html: str, type: str):
# before processing elists
self.MESSAGE.print_text(
','.join(elinks), # type: ignore
header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \
'[BeforeProcessing elinks]' + Color.END,
separator=" :",
mode="debug",
)

# before processing etitles
self.MESSAGE.print_text(
','.join(etitles),
header=self.MESSAGE.HEADER + ': ' +
Color.BLUE + '[BeforeProcessing etitles]' + Color.END,
','.join(etitles), # type: ignore
header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \
'[BeforeProcessing etitles]' + Color.END,
separator=" :",
mode="debug",
)
Expand All @@ -661,16 +664,16 @@ def get_links(self, source_url, html: str, type: str):
# after processing elists
self.MESSAGE.print_text(
','.join(elinks), # type: ignore
header=self.MESSAGE.HEADER + ': ' +
header=self.MESSAGE.HEADER + ': ' + \
Color.GREEN + '[AfterProcessing elinks]' + Color.END,
separator=" :",
mode="debug",
)

# after processing etitles
self.MESSAGE.print_text(
','.join(etitles),
header=self.MESSAGE.HEADER + ': ' +
','.join(etitles), # type: ignore
header=self.MESSAGE.HEADER + ': ' + \
Color.GREEN + '[AfterProcessing etitles]' + Color.END,
separator=" :",
mode="debug",
Expand Down
60 changes: 15 additions & 45 deletions pydork/engine_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* Google用の検索用Classを持つモジュール.
"""

# import sys

import json
import os
Expand All @@ -18,7 +19,7 @@
from json.decoder import JSONDecodeError
from urllib import parse
from lxml import etree
from bs4 import BeautifulSoup
# from bs4 import BeautifulSoup

from .common import Color
from .recaptcha import TwoCaptcha
Expand Down Expand Up @@ -77,12 +78,10 @@ def gen_search_url(self, keyword: str, type: str):
# 検索パラメータの設定
url_param = {
'q': keyword, # 検索キーワード
'oq': keyword, # 検索キーワード
'num': '100', # 1ページごとの表示件数.
'filter': '0', # 類似ページのフィルタリング(0...無効, 1...有効)
'start': '', # 開始位置
'tbs': '', # 期間
'nfpr': '1' # もしかして検索(Escape hatch)を無効化
'oq': keyword, # 検索キーワード
'num': 100, # 1ページごとの表示件数.
'filter': 0, # 類似ページのフィルタリング(0...無効, 1...有効)
'nfpr': 1 # もしかして検索(Escape hatch)を無効化
}

# lang/localeが設定されている場合
Expand All @@ -107,17 +106,11 @@ def gen_search_url(self, keyword: str, type: str):

page = 0
while True:
if page == 0:
# parameterにページを開始する番号を指定
url_param['start'] = str(page * 100)
params = parse.urlencode(url_param)

target_url = search_url + '?' + params
# parameterにページを開始する番号を指定
url_param['start'] = str(page * 100)
params = parse.urlencode(url_param)

else:
target_url = self.SEARCH_NEXT_URL
if self.SEARCH_NEXT_URL is None:
break
target_url = search_url + '?' + params

yield 'GET', target_url, None
page += 1
Expand Down Expand Up @@ -209,20 +202,20 @@ def get_links(self, url: str, html: str, type: str):

# Selenium経由、かつFirefoxを使っている場合
if self.USE_SELENIUM:
self.SOUP_SELECT_URL = '.yuRUbf > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
self.SOUP_SELECT_URL = '.yuRUbf > div > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > div > a > .LC20lb'
self.SOUP_SELECT_TEXT = '.lEBKkf'
self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'

# Splash経由で通信している場合
elif self.USE_SPLASH:
self.SOUP_SELECT_URL = '.yuRUbf > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
self.SOUP_SELECT_TEXT = '.lEBKkf'
self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'

# TODO: SEARCH_NEXT_URLを書き換える
self.get_nextpage_url(html)
# self.get_nextpage_url(html)

# CommonEngineの処理を呼び出す
links = super().get_links(url, html, type)
Expand Down Expand Up @@ -308,29 +301,6 @@ def get_suggest_list(self, suggests: list, char: str, html: str):

return suggests

def get_nextpage_url(self, html: str):
    """Update ``self.SEARCH_NEXT_URL`` from the pagination links in *html*.

    The HTML is parsed with BeautifulSoup (lxml parser) and the elements
    matching ``self.SOUP_SELECT_NEXT_URL`` are collected.  Their ``href``
    values are resolved against ``self.ENGINE_TOP_URL``.

    * no usable link   -> ``SEARCH_NEXT_URL`` is set to ``None``
    * exactly one link -> that link is used
    * several links    -> the second link is used

    Args:
        html: HTML source of the current result page.

    Returns:
        None. The result is stored on ``self.SEARCH_NEXT_URL``.
    """
    # Parse the page with BeautifulSoup.
    soup = BeautifulSoup(html, 'lxml')

    # Look up the candidate "next page" elements.
    anchors = soup.select(self.SOUP_SELECT_NEXT_URL)

    # Collect the hrefs.  Elements without an href attribute are skipped;
    # indexing them directly would raise KeyError and abort the search.
    hrefs = [a['href'] for a in anchors if a.has_attr('href')]

    if not hrefs:
        self.SEARCH_NEXT_URL = None
        return

    # NOTE(review): with more than one match the second link is chosen,
    # presumably because the first one is the "previous page" anchor --
    # confirm against the live markup.
    chosen = hrefs[0] if len(hrefs) == 1 else hrefs[1]

    self.SEARCH_NEXT_URL = parse.urljoin(
        self.ENGINE_TOP_URL, chosen)  # type: ignore

def processings_elist(self, elinks, etitles, etexts: list):
"""processings_elist
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ def get_completefile_install_location(shell):


name = 'pydork'
version = '1.1.5'
release = '1.1.5'
version = '1.1.6'
release = '1.1.6'

if __name__ == "__main__":
setuptools.setup(
Expand Down

0 comments on commit 733087b

Please sign in to comment.