Commit d95d53d

update.

blacknon committed Jul 17, 2023
1 parent 561cec8 commit d95d53d
Showing 5 changed files with 48 additions and 31 deletions.
2 changes: 1 addition & 1 deletion pydork/__init__.py
@@ -140,7 +140,7 @@ def main():
         },
         {
             "args": ["--delete-cookies"],
-            "type": bool,
+            "action": "store_true",
             "help": messages.help_message_op_delete_cookies,
         },
     ]
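
A note on why this hunk matters: with argparse, `type=bool` is a classic trap, because `bool()` is applied to the raw option string, so any non-empty value (even "false") parses as True. `action="store_true"` turns the option into a genuine flag. A minimal standalone sketch, independent of the pydork codebase:

```python
# Hypothetical, self-contained demo of the type=bool pitfall fixed above.
import argparse

parser = argparse.ArgumentParser()
# Old style: bool("false") == True, so the value is almost always True.
parser.add_argument("--old-delete-cookies", type=bool, default=False)
# Fixed style: False unless the flag is present on the command line.
parser.add_argument("--delete-cookies", action="store_true")

args = parser.parse_args(["--old-delete-cookies", "false", "--delete-cookies"])
print(args.old_delete_cookies)  # True -- surprising, but correct for type=bool
print(args.delete_cookies)      # True because the flag was passed
```
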
37 changes: 22 additions & 15 deletions pydork/engine.py
@@ -113,7 +113,7 @@ def set_is_debug(self, is_debug: bool):
             debug (bool): debug flag(Enable debug with `True`).
         """
 
-        self.ENGINE.IS_DEBUG = is_debug
+        self.ENGINE.IS_DEBUG = is_debug  # type: ignore
 
     # Function that enables the command flag (execution in command mode)
     def set_is_command(self, is_command: bool):
@@ -125,7 +125,7 @@ def set_is_command(self, is_command: bool):
         Args:
             is_command (bool): command flag(Enable command mode with `True`).
         """
-        self.ENGINE.IS_COMMAND = is_command
+        self.ENGINE.IS_COMMAND = is_command  # type: ignore
 
     # Whether color output is enabled
     def set_is_color(self, is_color: bool = False):
@@ -163,7 +163,7 @@ def set_disable_headless(self, disable_headless: bool):
         """
 
-        self.ENGINE.IS_DISABLE_HEADLESS = disable_headless
+        self.ENGINE.IS_DISABLE_HEADLESS = disable_headless  # type: ignore
 
     # Function that takes the directory holding the cookie files and determines the cookie file to use
     def set_cookie_files(self, cookie_dir: str):
@@ -177,8 +177,8 @@ def set_cookie_files(self, cookie_dir: str):
         """
 
         # Convert to a full path
-        cookie_dir = pathlib.Path(cookie_dir).expanduser()
-        cookie_dir = pathlib.Path(cookie_dir).resolve()
+        cookie_dir = pathlib.Path(cookie_dir).expanduser()  # type: ignore
+        cookie_dir = pathlib.Path(cookie_dir).resolve()  # type: ignore
 
         # Check for existence and create the directory if it does not exist
         if not os.path.exists(cookie_dir):
@@ -205,7 +205,7 @@ def set_cookie_files(self, cookie_dir: str):
             open(cookie_file, 'a').close()
 
         # Set it on the ENGINE instance
-        self.ENGINE.COOKIE_FILE = cookie_file
+        self.ENGINE.COOKIE_FILE = cookie_file  # type: ignore
 
     # Function that specifies whether the cookie is deleted and recreated on every query
     def set_cookie_files_delete(self, is_delete_cookie: bool):
@@ -218,7 +218,7 @@ def set_cookie_files_delete(self, is_delete_cookie: bool):
         """
 
         # Set it on the ENGINE instance
-        self.ENGINE.COOKIE_FILE_DELETE = is_delete_cookie
+        self.ENGINE.COOKIE_FILE_DELETE = is_delete_cookie  # type: ignore
 
     # Accept the language/country settings passed to the search engine
     def set_lang(self, lang: str = "ja", locale: str = "JP"):
@@ -257,7 +257,7 @@ def set_proxy(self, proxy: str):
         self.ENGINE.set_proxy(proxy)
 
     # Enable Selenium
-    def set_selenium(self, uri: str = None, browser: str = None):
+    def set_selenium(self, uri: str = None, browser: str = None):  # type: ignore
         """set_selenium
         Use Selenium (priority over Splash).
@@ -282,7 +282,7 @@ def set_splash(self, splash_url: str):
         self.ENGINE.set_splash(splash_url)
 
     # Accept the user_agent setting
-    def set_user_agent(self, useragent: str = None):
+    def set_user_agent(self, useragent: str = None):  # type: ignore
         """set_user_agent
         Specify the UserAgent.
@@ -304,7 +304,7 @@ def set_ignore_ssl(self, verify: bool):
         Args:
             verify (bool): bool.
         """
-        self.ENGINE.set_ignore_ssl = verify
+        self.ENGINE.set_ignore_ssl = verify  # type: ignore
 
     # Perform the search
     def search(self, keyword: str, search_type='text', maximum=100):
@@ -384,7 +384,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
         )
 
         # Fetch the search results
-        html = self.ENGINE.get_result(url, method=method, data=data)
+        html = self.ENGINE.get_result(
+            url, method=method, data=data)  # type: ignore
 
         # debug
         self.ENGINE.MESSAGE.print_text(
@@ -395,6 +396,9 @@ def search(self, keyword: str, search_type='text', maximum=100):
             Color.GRAY + '[DEBUG]: [Response]' + Color.END
         )
 
+        # Initial value
+        is_recaptcha = False
+
         while True:
             # Determine whether the page is a ReCaptcha page
             if html is not None:
@@ -414,7 +418,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
                # When using a headless browser
                if self.ENGINE.USE_SELENIUM or self.ENGINE.USE_SPLASH:
                    # Pass it to the bypass function
-                   html = self.ENGINE.bypass_recaptcha(url, html)
+                   html = self.ENGINE.bypass_recaptcha(
+                       url, html)  # type: ignore
 
                    if html is not None:
                        # debug
@@ -447,7 +452,8 @@ def search(self, keyword: str, search_type='text', maximum=100):
 
             # TODO: also pass result to the function so it can check for duplicates
             # Parse the search results and obtain the list of urls
-            links = self.ENGINE.get_links(url, html, search_type)
+            links = self.ENGINE.get_links(
+                url, html, search_type)  # type: ignore
 
             # Act according to the number of links
             if not len(links):
@@ -461,7 +467,7 @@ def search(self, keyword: str, search_type='text', maximum=100):
 
             # Exit the loop
             if self.ENGINE.NAME == "Google":
-                if self.ENGINE.SEARCH_NEXT_URL is None:
+                if self.ENGINE.SEARCH_NEXT_URL is None:  # type: ignore
                     break
             else:
                 break
@@ -548,7 +554,8 @@ def suggest(self, keyword: str, jap=False, alph=False, num=False):
             html = self.ENGINE.get_result(url)
 
             # TODO: each engine needs its own json/text conversion handling implemented separately
-            suggests = self.ENGINE.get_suggest_list(suggests, char, html)
+            suggests = self.ENGINE.get_suggest_list(
+                suggests, char, html)  # type: ignore
 
             sleep(0.5)
 
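
Two things stand out in this file: most edits add `# type: ignore`, which silences mypy on just that line, and one edit initializes `is_recaptcha = False` before the `while True:` loop. Reading the diff, the initializer looks like a guard against reading the variable before it is ever assigned, for instance when the first response is `None`. A minimal sketch of that pattern, with illustrative names rather than the pydork code:

```python
# Hypothetical reduction of the loop above: without the initializer, the
# first read of is_recaptcha would raise UnboundLocalError when html is None,
# because the only assignment sits inside a conditional branch.
def fetch_all(pages):
    is_recaptcha = False  # initial value, set before the loop ever reads it

    for html in pages:
        if html is not None:
            is_recaptcha = "recaptcha" in html  # only assigned on this branch

        if is_recaptcha:  # safe to read even when html was None
            print("recaptcha detected; would invoke the bypass here")

fetch_all([None, "<html>recaptcha challenge</html>"])
```
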
25 changes: 16 additions & 9 deletions pydork/engine_common.py
@@ -98,7 +98,7 @@ def set_range(self, start: datetime, end: datetime):
         self.RANGE_END = end
 
     # Accept the user_agent setting (random when omitted; with Selenium, the agent of the browser in use is applied automatically)
-    def set_user_agent(self, user_agent: str = None, browser: str = None):
+    def set_user_agent(self, user_agent: str = None, browser: str = None):  # type: ignore
         """set_user_agent
         Accept the user_agent value.
@@ -138,7 +138,7 @@ def set_user_agent(self, user_agent: str = None, browser: str = None):
     # - Takes priority over Splash
     # - host and browser fall back to their defaults when not specified (no host; browser defaults to chrome)
     # - browser accepts only `chrome` or `firefox`
-    def set_selenium(self, uri: str = None, browser: str = None):
+    def set_selenium(self, uri: str = None, browser: str = None):  # type: ignore
         """set_selenium
         Communicate via Selenium when searching.
@@ -200,6 +200,13 @@ def read_cookies(self):
         Currently works only with Selenium.
         """
 
+        # If the cookie file does not exist, create it as an empty file
+        exist_cookie_file = os.path.isfile(self.COOKIE_FILE)
+        if not exist_cookie_file:
+            cookie_file = open(self.COOKIE_FILE, 'w')
+            cookie_file.write('')
+            cookie_file.close()
+
         # Get the size of the cookie file
         file_size = os.path.getsize(self.COOKIE_FILE)
 
@@ -211,7 +218,7 @@ def read_cookies(self):
         # When using Selenium
         if self.USE_SELENIUM:
             # Prior access is required, so visit the top page of the target search domain first
-            self.driver.get(self.ENGINE_TOP_URL)
+            self.driver.get(self.ENGINE_TOP_URL)  # type: ignore
 
             # Set the cookies one by one
             for cookie in cookies:
@@ -397,7 +404,7 @@ def request_selenium(self, url: str, method='GET', data=None):
                 EC.presence_of_all_elements_located)
 
             # wait 5 seconds(wait DOM)
-            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):
+            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):  # type: ignore
                 self.driver.implicitly_wait(20)
 
             # get result
@@ -411,7 +418,7 @@ def request_selenium(self, url: str, method='GET', data=None):
                 EC.presence_of_all_elements_located)
 
             # wait 5 seconds(wait DOM)
-            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):
+            if self.NAME in ('Bing', 'Baidu', 'DuckDuckGo'):  # type: ignore
                 self.driver.implicitly_wait(20)
 
             # get result
@@ -452,7 +459,7 @@ def request_splash(self, url: str, method='GET', data=None):
 
         # NOTE: Splash cannot render the POST used by Google image search, so requests is used as a special-case workaround.
         # TODO: rewrite once Splash can render it as well.
-        elif method == 'POST' and self.NAME == 'Google' and self.IMAGE_URL in url:
+        elif method == 'POST' and self.NAME == 'Google' and self.IMAGE_URL in url:  # type: ignore
             # create session
             session = requests.session()
 
@@ -478,7 +485,7 @@ def request_splash(self, url: str, method='GET', data=None):
         elif method == 'POST':
             headers = {'Content-Type': 'application/json'}
             params['http_method'] = 'POST'
-            params['body'] = parse.urlencode(data)
+            params['body'] = parse.urlencode(data)  # type: ignore
 
             result = self.session.post(
                 splash_url,
@@ -631,7 +638,7 @@ def get_links(self, source_url, html: str, type: str):
 
         # before processing elists
         self.MESSAGE.print_text(
-            ','.join(elinks),
+            ','.join(elinks),  # type: ignore
             header=self.MESSAGE.HEADER + ': ' + Color.BLUE +
             '[BeforeProcessing elinks]' + Color.END,
             separator=" :",
@@ -653,7 +660,7 @@ def get_links(self, source_url, html: str, type: str):
 
         # after processing elists
         self.MESSAGE.print_text(
-            ','.join(elinks),
+            ','.join(elinks),  # type: ignore
             header=self.MESSAGE.HEADER + ': ' +
             Color.GREEN + '[AfterProcessing elinks]' + Color.END,
             separator=" :",
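
The substantive change in this file is the guard at the top of read_cookies: os.path.getsize raises FileNotFoundError for a missing path, so the cookie file is now created empty before its size is checked. A hedged sketch of the same guard, using a hypothetical path (pathlib offers a terser spelling than open/write/close):

```python
# Hypothetical path; the real COOKIE_FILE is set elsewhere in pydork.
from pathlib import Path

cookie_file = Path("~/.pydork_cookies/google.cookie").expanduser()
cookie_file.parent.mkdir(parents=True, exist_ok=True)  # ensure the directory
cookie_file.touch(exist_ok=True)         # create an empty file if missing
size = cookie_file.stat().st_size        # now safe: the file always exists
print(size)                              # 0 on first run
```
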
8 changes: 5 additions & 3 deletions pydork/engine_google.py
@@ -304,7 +304,7 @@ def get_suggest_list(self, suggests: list, char: str, html: str):
         sug_data = sug_root.xpath("//suggestion")
         data = [s.get("data") for s in sug_data]
 
-        suggests[char if char == '' else char[-1]] = data
+        suggests[char if char == '' else char[-1]] = data  # type: ignore
 
         return suggests
 
@@ -322,11 +322,13 @@ def get_nextpage_url(self, html: str):
             self.SEARCH_NEXT_URL = None
 
         elif len(elinks) == 1:
-            next_url = parse.urljoin(self.ENGINE_TOP_URL, elinks[0])
+            next_url = parse.urljoin(
+                self.ENGINE_TOP_URL, elinks[0])  # type: ignore
             self.SEARCH_NEXT_URL = next_url
 
         elif len(elinks) > 1:
-            next_url = parse.urljoin(self.ENGINE_TOP_URL, elinks[1])
+            next_url = parse.urljoin(
+                self.ENGINE_TOP_URL, elinks[1])  # type: ignore
             self.SEARCH_NEXT_URL = next_url
 
     def processings_elist(self, elinks, etitles, etexts: list):
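
For readers unfamiliar with it, parse.urljoin (from the standard library's urllib) is what resolves the scraped next-page href against the engine's top URL; only the line wrapping and the # type: ignore changed here. A quick standalone illustration with made-up URLs:

```python
# Standard-library behavior only; the URLs are invented for the demo.
from urllib import parse

base = "https://www.google.com"
print(parse.urljoin(base, "/search?q=pydork&start=10"))
# -> https://www.google.com/search?q=pydork&start=10

# Absolute hrefs pass through untouched, which is why urljoin is safer
# than plain string concatenation for scraped links:
print(parse.urljoin(base, "https://example.com/next"))
# -> https://example.com/next
```
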
7 changes: 4 additions & 3 deletions pydork/engine_yahoo.py
@@ -188,10 +188,11 @@ def get_links(self, url: str, html: str, type: str):
             if self.IS_DEBUG:
                 print(Color.PURPLE + '[JsonElement]' + Color.END,
                       file=sys.stderr)
-                print(Color.PURPLE + element + Color.END, file=sys.stderr)
+                print(Color.PURPLE + element + Color.END,
+                      file=sys.stderr)  # type: ignore
 
             # Extract the data from the json
-            j = json.loads(element)
+            j = json.loads(element)  # type: ignore
 
             # debug
             if self.IS_DEBUG:
@@ -272,7 +273,7 @@ def get_suggest_list(self, suggests: list, char: str, html: str):
         soup = BeautifulSoup(html, features="lxml")
         html = soup.find("pre").text
         data = json.loads(html)
-        suggests[char if char == '' else char[-1]] = [e['key']
+        suggests[char if char == '' else char[-1]] = [e['key']  # type: ignore
                                                       for e in data['gossip']['results']]
 
         return suggests
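
The suggest parser above pulls a pre element out of the response and then reads keys from a gossip/results JSON structure. A self-contained sketch of that flow with stubbed data (the JSON shape is taken from the diff itself; the values are invented):

```python
# Stubbed response body; the real one comes from Yahoo's suggest endpoint.
import json

from bs4 import BeautifulSoup

raw = '<pre>{"gossip": {"results": [{"key": "pydork"}, {"key": "python"}]}}</pre>'
soup = BeautifulSoup(raw, features="lxml")
payload = soup.find("pre").text          # the JSON sits inside the <pre> tag
data = json.loads(payload)
print([e['key'] for e in data['gossip']['results']])
# -> ['pydork', 'python']
```
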
