Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Version 1.1.6 #19

Merged
merged 4 commits into from
Sep 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions pydork/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,17 +369,17 @@ def search(self, keyword: str, search_type='text', maximum=100):
self.ENGINE.MESSAGE.print_text(
url,
mode='debug',
separator=": ",
header=self.ENGINE.MESSAGE.HEADER + ': ' +
separator=": ", # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + \
Color.GRAY + '[DEBUG]: [TargetURL]' + Color.END
)

# debug
self.ENGINE.MESSAGE.print_text(
self.ENGINE.USER_AGENT,
mode='debug',
separator=": ",
header=self.ENGINE.MESSAGE.HEADER + ': ' +
separator=": ", # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + \
Color.GRAY + '[DEBUG]: [UserAgent]' + Color.END
)

Expand All @@ -391,8 +391,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
self.ENGINE.MESSAGE.print_text(
html,
mode='debug',
separator=": ",
header=self.ENGINE.MESSAGE.HEADER + ': ' +
separator=": ", # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + \
Color.GRAY + '[DEBUG]: [Response]' + Color.END
)

Expand Down Expand Up @@ -425,8 +425,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
# debug
self.ENGINE.MESSAGE.print_text(
html,
mode='debug',
header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY +
mode='debug', # type: ignore
header=self.ENGINE.MESSAGE.HEADER + ': ' + Color.GRAY + \
'[DEBUG]: [ReCaptchaedResponse]' + Color.END,
separator=": "
)
Expand Down Expand Up @@ -495,7 +495,8 @@ def search(self, keyword: str, search_type='text', maximum=100):

# commandの場合の出力処理
self.ENGINE.MESSAGE.print_text(
'Finally got ' + self.ENGINE.COLOR +
# type: ignore
'Finally got ' + self.ENGINE.COLOR + \
str(len(result)) + Color.END + ' links.',
header=self.ENGINE.MESSAGE.ENGINE,
separator=": ",
Expand Down
31 changes: 17 additions & 14 deletions pydork/engine_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
* SearchEngine Classから呼び出す、各検索エンジンで共通の処理を保持させる継承用Classである `CommonEngine` を持つモジュール.
"""


import requests
import os
import pickle
import time

# selenium driver auto install packages
import chromedriver_autoinstaller
Expand Down Expand Up @@ -372,12 +372,9 @@ def create_selenium_driver(self):
pass
self.driver = Firefox(options=options, firefox_profile=profile)

# NOTE:
# User Agentを確認する場合、↓の処理で実施可能(Chrome/Firefoxともに)。
# ```python
# user_agent = self.driver.execute_script("return navigator.userAgent")
# print(user_agent)
# ```
# User agentを指定させる
user_agent = self.driver.execute_script("return navigator.userAgent")
self.set_user_agent(user_agent)

return

Expand Down Expand Up @@ -578,6 +575,12 @@ def get_result(self, url: str, method='GET', data=None):
if self.USE_SELENIUM:
result = self.request_selenium(url, method=method, data=data)

for i in range(0, 10):
self.driver.execute_script(
"window.scrollTo(0,document.body.scrollHeight)"
)
time.sleep(3)

# 優先度2: Splash経由でのアクセス(Seleniumが有効になってない場合はこちら)
elif self.USE_SPLASH:
# create splash url
Expand Down Expand Up @@ -639,17 +642,17 @@ def get_links(self, source_url, html: str, type: str):
# before processing elists
self.MESSAGE.print_text(
','.join(elinks), # type: ignore
header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \
'[BeforeProcessing elinks]' + Color.END,
separator=" :",
mode="debug",
)

# before processing etitles
self.MESSAGE.print_text(
','.join(etitles),
header=self.MESSAGE.HEADER + ': ' +
Color.BLUE + '[BeforeProcessing etitles]' + Color.END,
','.join(etitles), # type: ignore
header=self.MESSAGE.HEADER + ': ' + Color.BLUE + \
'[BeforeProcessing etitles]' + Color.END,
separator=" :",
mode="debug",
)
Expand All @@ -661,16 +664,16 @@ def get_links(self, source_url, html: str, type: str):
# after processing elists
self.MESSAGE.print_text(
','.join(elinks), # type: ignore
header=self.MESSAGE.HEADER + ': ' +
header=self.MESSAGE.HEADER + ': ' + \
Color.GREEN + '[AfterProcessing elinks]' + Color.END,
separator=" :",
mode="debug",
)

# after processing etitles
self.MESSAGE.print_text(
','.join(etitles),
header=self.MESSAGE.HEADER + ': ' +
','.join(etitles), # type: ignore
header=self.MESSAGE.HEADER + ': ' + \
Color.GREEN + '[AfterProcessing etitles]' + Color.END,
separator=" :",
mode="debug",
Expand Down
60 changes: 15 additions & 45 deletions pydork/engine_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* Google用の検索用Classを持つモジュール.
"""

# import sys

import json
import os
Expand All @@ -18,7 +19,7 @@
from json.decoder import JSONDecodeError
from urllib import parse
from lxml import etree
from bs4 import BeautifulSoup
# from bs4 import BeautifulSoup

from .common import Color
from .recaptcha import TwoCaptcha
Expand Down Expand Up @@ -77,12 +78,10 @@ def gen_search_url(self, keyword: str, type: str):
# 検索パラメータの設定
url_param = {
'q': keyword, # 検索キーワード
'oq': keyword, # 検索キーワード
'num': '100', # 1ページごとの表示件数.
'filter': '0', # 類似ページのフィルタリング(0...無効, 1...有効)
'start': '', # 開始位置
'tbs': '', # 期間
'nfpr': '1' # もしかして検索(Escape hatch)を無効化
'oq': keyword, # 検索キーワード
'num': 100, # 1ページごとの表示件数.
'filter': 0, # 類似ページのフィルタリング(0...無効, 1...有効)
'nfpr': 1 # もしかして検索(Escape hatch)を無効化
}

# lang/localeが設定されている場合
Expand All @@ -107,17 +106,11 @@ def gen_search_url(self, keyword: str, type: str):

page = 0
while True:
if page == 0:
# parameterにページを開始する番号を指定
url_param['start'] = str(page * 100)
params = parse.urlencode(url_param)

target_url = search_url + '?' + params
# parameterにページを開始する番号を指定
url_param['start'] = str(page * 100)
params = parse.urlencode(url_param)

else:
target_url = self.SEARCH_NEXT_URL
if self.SEARCH_NEXT_URL is None:
break
target_url = search_url + '?' + params

yield 'GET', target_url, None
page += 1
Expand Down Expand Up @@ -209,20 +202,20 @@ def get_links(self, url: str, html: str, type: str):

# Selenium経由、かつFirefoxを使っている場合
if self.USE_SELENIUM:
self.SOUP_SELECT_URL = '.yuRUbf > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
self.SOUP_SELECT_URL = '.yuRUbf > div > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > div > a > .LC20lb'
self.SOUP_SELECT_TEXT = '.lEBKkf'
self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'

# Splash経由で通信している場合
elif self.USE_SPLASH:
self.SOUP_SELECT_URL = '.yuRUbf > a'
self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
self.SOUP_SELECT_TEXT = '.lEBKkf'
self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
self.SOUP_SELECT_NEXT_URL = '.AaVjTc > tbody > tr > td > a'

# TODO: SEARCH_NEXT_URLを書き換える
self.get_nextpage_url(html)
# self.get_nextpage_url(html)

# CommonEngineの処理を呼び出す
links = super().get_links(url, html, type)
Expand Down Expand Up @@ -308,29 +301,6 @@ def get_suggest_list(self, suggests: list, char: str, html: str):

return suggests

def get_nextpage_url(self, html: str):
# BeautifulSoupでの解析を実施
soup = BeautifulSoup(html, 'lxml')

# BeautifulSoupでnext urlの要素を確認する
elements = soup.select(self.SOUP_SELECT_NEXT_URL)

# next urlを取得する
elinks = [e['href'] for e in elements]

if len(elinks) == 0:
self.SEARCH_NEXT_URL = None

elif len(elinks) == 1:
next_url = parse.urljoin(
self.ENGINE_TOP_URL, elinks[0]) # type: ignore
self.SEARCH_NEXT_URL = next_url

elif len(elinks) > 1:
next_url = parse.urljoin(
self.ENGINE_TOP_URL, elinks[1]) # type: ignore
self.SEARCH_NEXT_URL = next_url

def processings_elist(self, elinks, etitles, etexts: list):
"""processings_elist

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ def get_completefile_install_location(shell):


name = 'pydork'
version = '1.1.5'
release = '1.1.5'
version = '1.1.6'
release = '1.1.6'

if __name__ == "__main__":
setuptools.setup(
Expand Down
Loading