2022.11.05 (01. 링크 애니 요청 페이지 일부 수정 및 일부 구현 저장용)
This commit is contained in:
@@ -144,6 +144,7 @@ class FfmpegQueue(object):
|
||||
# 파일 존재여부 체크
|
||||
print(entity.info)
|
||||
filepath = entity.get_video_filepath()
|
||||
P.logger.debug(f'filepath:: {filepath}')
|
||||
if os.path.exists(filepath):
|
||||
entity.ffmpeg_status_kor = '파일 있음'
|
||||
entity.ffmpeg_percent = 100
|
||||
@@ -162,7 +163,8 @@ class FfmpegQueue(object):
|
||||
P.logger.debug(P)
|
||||
# P.logger.debug(P.system_setting.get("port"))
|
||||
|
||||
ffmpeg = SupportFfmpeg(video_url, os.path.basename(filepath), callback_function=self.callback_function,
|
||||
ffmpeg = SupportFfmpeg(video_url, os.path.basename(str(filepath)),
|
||||
callback_function=self.callback_function,
|
||||
max_pf_count=0, save_path=ToolUtil.make_path(dirname), timeout_minute=60,
|
||||
)
|
||||
#
|
||||
@@ -359,6 +361,7 @@ class FfmpegQueue(object):
|
||||
|
||||
def get_entity_list(self):
|
||||
ret = []
|
||||
P.logger.debug(self)
|
||||
for x in self.entity_list:
|
||||
tmp = x.as_dict()
|
||||
ret.append(tmp)
|
||||
|
||||
617
lib/crawler.py
Normal file
617
lib/crawler.py
Normal file
@@ -0,0 +1,617 @@
|
||||
import asyncio
|
||||
import os
|
||||
import platform
|
||||
import traceback
|
||||
|
||||
import cloudscraper
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
from anime_downloader.lib.util import yommi_timeit
|
||||
|
||||
|
||||
class Crawler:
|
||||
|
||||
    def __init__(self):
        """Initialize the crawler: lazy HTTP session plus browser-like default headers."""
        # requests session is created lazily on first fetch (see get_html_requests).
        self.session = None
        # Default headers impersonating desktop Chrome for anilife.live.
        # NOTE(review): the hard-coded 'cookie' and 'referer' values look like a
        # captured session — presumably stale; confirm before relying on them.
        self.headers = {
            # 'authority': 'anilife.live',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'accept-language': 'ko-KR,ko;q=0.8',
            'cache-control': 'no-cache',
            'cookie': 'SL_G_WPT_TO=ko; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; DSR=WQYVukjkxKVYEbpgM0pgMs+awM/br6JyMtbfB4OGMC0XEA+UxUxR1RUgOi1mNMoQB16xIEuqk64iex+/ahi72A==; DCSS=FEC4550B310816E1CA91CBE4A0069C43E04F108; SPSI=faccf9a99dee9625af1c93607c2be678; SPSE=j3smljSGgZcayyKDFoQKk5/tnnUnFHa9FzCrL6GOkRwsET506JX0hAvzye3rEobnKfHiir8mAw8z7/KG11QQXw==; anilife_csrf=f30c66ba689880e9710a85b1945ad798; UTGv2=h4a5ce301324340f0b03d9e61e42bc6c0416; spcsrf=77a4e9c38c8e7392b7a36818071a5e3e; sp_lit=acrE8Wfvo4cd6GxQyGoytg==; PRLST=RT; adOtr=fZaBf9c9aed',
            # 'pragma': 'no-cache',
            'referer': 'https://anilife.live/g/l?id=afb8c5e4-1720-4f3d-a6b1-27e7473dc6fb',
            # 'sec-fetch-dest': 'document',
            # 'sec-fetch-mode': 'navigate',
            # 'sec-fetch-site': 'same-origin',
            # 'sec-fetch-user': '?1',
            # 'sec-gpc': '1',
            # 'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
        }
        # Final URL after redirects; updated by the fetch helpers.
        self.origin_url = ''
        # Last visited episode URL; set by get_html_selenium, read by the
        # playwright helpers as a Referer fallback.
        self.episode_url = None
        # Host OS name ('Darwin', 'Linux', 'Windows') used to pick a chromedriver.
        self.OS_PLATFORM = platform.system()
||||
|
||||
def get_html_requests(
|
||||
self,
|
||||
url: str, referer: str = None, stream: str = False, timeout: int = 5
|
||||
) -> str:
|
||||
data = ""
|
||||
try:
|
||||
print("get_html_requests ==================")
|
||||
|
||||
# cj = browser_cookie3.chrome(domain_name="anilife.live")
|
||||
referer = "https://anilife.live/"
|
||||
|
||||
if self.session is None:
|
||||
self.session = requests.session()
|
||||
|
||||
# logger.debug('get_html :%s', url)
|
||||
self.headers["Referer"] = "" if referer is None else referer
|
||||
self.headers[
|
||||
"Cookie"
|
||||
] = "_ga=GA1.1.578607927.1660813724; __gads=ID=10abb8b98b6828ae-2281c943a9d500fd:T=1660813741:RT=1660813741:S=ALNI_MYU_iB2lBgSrEQUBwhKpNsToaqQ8A; sbtsck=javuwDzcOJqUyweM1OQeNGzHbjoHp7Cgw44XnPdM738c3E=; SPSI=e48379959d54a6a62cc7abdcafdb2761; SPSE=h5HfMGLJzLqzNafMD3YaOvHSC9xfh77CcWdKvexp/z5N5OsTkIiYSCudQhFffEfk/0pcOTVf0DpeV0RoNopzig==; anilife_csrf=b93b9f25a12a51cf185805ec4de7cf9d; UTGv2=h46b326af644f4ac5d0eb1502881136b3750; __gpi=UID=000008ba227e99e0:T=1660813741:RT=1660912282:S=ALNI_MaJHIVJIGpQ5nTE9lvypKQxJnn10A; DSR=SXPX8ELcRgh6N/9rNgjpQoNfaX2DRceeKYR0/ul7qTI9gApWQpZxr8jgymf/r0HsUT551vtOv2CMWpIn0Hd26A==; DCSS=89508000A76BBD939F6DDACE5BD9EB902D2212A; DGCC=Wdm; adOtr=7L4Xe58995d; spcsrf=6554fa003bf6a46dd9b7417acfacc20a; _ga_56VYJJ7FTM=GS1.1.1660912281.10.1.1660912576.0.0.0; PRLST=EO"
|
||||
|
||||
self.headers["Referer"] = referer
|
||||
|
||||
page_content = self.session.get(
|
||||
url, headers=self.headers, timeout=timeout, allow_redirects=True
|
||||
)
|
||||
data = page_content.text
|
||||
except Exception as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
return data
|
||||
|
||||
async def get_html_playwright(
|
||||
self,
|
||||
url: str,
|
||||
headless: bool = False,
|
||||
referer: str = None,
|
||||
engine: str = "chrome",
|
||||
stealth: bool = False,
|
||||
):
|
||||
try:
|
||||
from playwright.async_api import async_playwright
|
||||
# from playwright.sync_api import sync_playwright
|
||||
|
||||
import time
|
||||
|
||||
print("** playwright ==========================================")
|
||||
|
||||
cookie = None
|
||||
browser_args = [
|
||||
"--window-size=1300,570",
|
||||
"--window-position=000,000",
|
||||
"--disable-dev-shm-usage",
|
||||
"--no-sandbox",
|
||||
"--disable-web-security",
|
||||
"--disable-features=site-per-process",
|
||||
"--disable-setuid-sandbox",
|
||||
"--disable-accelerated-2d-canvas",
|
||||
"--no-first-run",
|
||||
"--no-zygote",
|
||||
# '--single-process',
|
||||
"--disable-gpu",
|
||||
"--use-gl=egl",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-background-networking",
|
||||
"--enable-features=NetworkService,NetworkServiceInProcess",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-component-extensions-with-background-pages",
|
||||
"--disable-default-apps",
|
||||
"--disable-extensions",
|
||||
"--disable-features=Translate",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-sync",
|
||||
"--force-color-profile=srgb",
|
||||
"--metrics-recording-only",
|
||||
"--enable-automation",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
"--hide-scrollbars",
|
||||
"--mute-audio",
|
||||
]
|
||||
browser_args = []
|
||||
browser = None
|
||||
# scraper = cloudscraper.create_scraper(
|
||||
# browser={"browser": "chrome", "platform": "windows", "desktop": True},
|
||||
# debug=False,
|
||||
# # sess=LogicAniLife.session,
|
||||
# delay=10,
|
||||
# )
|
||||
#
|
||||
# cookie_value, user_agent = scraper.get_cookie_string(url)
|
||||
#
|
||||
# logger.debug(f"cookie_value:: {cookie_value}")
|
||||
|
||||
start = time.time()
|
||||
ua = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/69.0.3497.100 Safari/537.36"
|
||||
)
|
||||
|
||||
# from playwright_stealth import stealth_sync
|
||||
|
||||
# def set_cookie(req):
|
||||
# nonlocal cookie
|
||||
# if "cookie" in req.headers:
|
||||
# cookie = req.headers["cookie"]
|
||||
|
||||
# headless = True
|
||||
|
||||
logger.info(engine)
|
||||
async with async_playwright() as p:
|
||||
browser = None
|
||||
try:
|
||||
if engine == "chrome":
|
||||
browser = await p.chromium.launch(
|
||||
channel="chrome", args=browser_args, headless=headless
|
||||
)
|
||||
print(engine)
|
||||
|
||||
# browser = await p.chromium.connect('http://192.168.0.2:14444')
|
||||
elif engine == "webkit":
|
||||
browser = await p.webkit.launch(
|
||||
headless=headless,
|
||||
args=browser_args,
|
||||
)
|
||||
else:
|
||||
print('here')
|
||||
browser = await p.firefox.launch(
|
||||
headless=headless,
|
||||
args=browser_args,
|
||||
)
|
||||
|
||||
# context = browser.new_context(
|
||||
# user_agent=ua,
|
||||
# )
|
||||
|
||||
# LogicAniLife.headers[
|
||||
# "Referer"
|
||||
# ] = "https://anilife.live/detail/id/471"
|
||||
# print(LogicAniLife.headers)
|
||||
|
||||
self.headers["Referer"] = self.episode_url
|
||||
|
||||
if referer is not None:
|
||||
self.headers["Referer"] = referer
|
||||
|
||||
logger.debug(f"self.headers {self.headers}")
|
||||
context = await browser.new_context(
|
||||
extra_http_headers=self.headers
|
||||
)
|
||||
# await context.add_cookies(self.cookies)
|
||||
|
||||
# self.headers["Cookie"] = cookie_value
|
||||
|
||||
# context.set_extra_http_headers(self.headers)
|
||||
print('here1')
|
||||
page = await context.new_page()
|
||||
|
||||
# page.set_extra_http_headers(self.headers)
|
||||
|
||||
# if stealth:
|
||||
# await stealth_async(page)
|
||||
|
||||
# page.on("request", set_cookie)
|
||||
# stealth_sync(page)
|
||||
print(self.headers["Referer"])
|
||||
|
||||
# page.on("request", set_cookie)
|
||||
|
||||
print(f'Referer:: {self.headers["Referer"]}')
|
||||
|
||||
# await page.set_extra_http_headers(self.headers)
|
||||
|
||||
await page.goto(
|
||||
url, wait_until="load", referer=self.headers["Referer"]
|
||||
)
|
||||
# page.wait_for_timeout(10000)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# await page.reload()
|
||||
|
||||
# time.sleep(10)
|
||||
# cookies = context.cookies
|
||||
# print(cookies)
|
||||
|
||||
print(f"page.url:: {page.url}")
|
||||
self.origin_url = page.url
|
||||
|
||||
print(await page.content())
|
||||
|
||||
print(f"run at {time.time() - start} sec")
|
||||
|
||||
return await page.content()
|
||||
except Exception as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
await browser.close()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
# browser.close()
|
||||
pass
|
||||
|
||||
def get_html_playwright_sync(
|
||||
self,
|
||||
url: str,
|
||||
headless: bool = False,
|
||||
referer: str = None,
|
||||
engine: str = "chrome",
|
||||
stealth: bool = False,
|
||||
) -> str:
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
import time
|
||||
|
||||
print("playwright ==========================================")
|
||||
|
||||
cookie = None
|
||||
browser_args = [
|
||||
"--window-size=1300,570",
|
||||
"--window-position=000,000",
|
||||
"--disable-dev-shm-usage",
|
||||
"--no-sandbox",
|
||||
"--disable-web-security",
|
||||
"--disable-features=site-per-process",
|
||||
"--disable-setuid-sandbox",
|
||||
"--disable-accelerated-2d-canvas",
|
||||
"--no-first-run",
|
||||
"--no-zygote",
|
||||
# '--single-process',
|
||||
"--disable-gpu",
|
||||
"--use-gl=egl",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-background-networking",
|
||||
"--enable-features=NetworkService,NetworkServiceInProcess",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-component-extensions-with-background-pages",
|
||||
"--disable-default-apps",
|
||||
"--disable-extensions",
|
||||
"--disable-features=Translate",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-sync",
|
||||
"--force-color-profile=srgb",
|
||||
"--metrics-recording-only",
|
||||
"--enable-automation",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
"--hide-scrollbars",
|
||||
"--mute-audio",
|
||||
]
|
||||
# scraper = cloudscraper.create_scraper(
|
||||
# browser={"browser": "chrome", "platform": "windows", "desktop": True},
|
||||
# debug=False,
|
||||
# # sess=LogicAniLife.session,
|
||||
# delay=10,
|
||||
# )
|
||||
#
|
||||
# cookie_value, user_agent = scraper.get_cookie_string(url)
|
||||
#
|
||||
# logger.debug(f"cookie_value:: {cookie_value}")
|
||||
|
||||
start = time.time()
|
||||
ua = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/69.0.3497.100 Safari/537.36"
|
||||
)
|
||||
|
||||
# from playwright_stealth import stealth_sync
|
||||
|
||||
def set_cookie(req):
|
||||
nonlocal cookie
|
||||
if "cookie" in req.headers:
|
||||
cookie = req.headers["cookie"]
|
||||
|
||||
# headless = True
|
||||
|
||||
with sync_playwright() as p:
|
||||
try:
|
||||
if engine == "chrome":
|
||||
# browser = await p.chromium.launch(
|
||||
# channel="chrome", args=browser_args, headless=headless
|
||||
# )
|
||||
print(engine)
|
||||
|
||||
# browser = p.chromium.connect_over_cdp('http://yommi.duckdns.org:14444')
|
||||
browser = p.chromium.launch(
|
||||
channel="chrome", args=browser_args, headless=headless
|
||||
)
|
||||
|
||||
elif engine == "webkit":
|
||||
browser = p.webkit.launch(
|
||||
headless=headless,
|
||||
args=browser_args,
|
||||
)
|
||||
else:
|
||||
browser = p.firefox.launch(
|
||||
headless=headless,
|
||||
args=browser_args,
|
||||
)
|
||||
# context = browser.new_context(
|
||||
# user_agent=ua,
|
||||
# )
|
||||
|
||||
self.headers[
|
||||
"Referer"
|
||||
] = "https://anilife.live/detail/id/471"
|
||||
# print(self.headers)
|
||||
|
||||
self.headers["Referer"] = self.episode_url
|
||||
|
||||
if referer is not None:
|
||||
self.headers["Referer"] = referer
|
||||
|
||||
logger.debug(f"self.headers::: {self.headers}")
|
||||
context = browser.new_context(
|
||||
extra_http_headers=self.headers
|
||||
)
|
||||
# await context.add_cookies(self.cookies)
|
||||
|
||||
# self.headers["Cookie"] = cookie_value
|
||||
|
||||
# context.set_extra_http_headers(self.headers)
|
||||
|
||||
page = context.new_page()
|
||||
|
||||
# page.set_extra_http_headers(self.headers)
|
||||
|
||||
if stealth:
|
||||
# stealth_async(page)
|
||||
pass
|
||||
|
||||
# page.on("request", set_cookie)
|
||||
# stealth_sync(page)
|
||||
print(self.headers["Referer"])
|
||||
|
||||
page.on("request", set_cookie)
|
||||
|
||||
print(f'Referer:: {self.headers["Referer"]}')
|
||||
# await page.set_extra_http_headers(self.headers)
|
||||
|
||||
page.goto(
|
||||
url, wait_until="load", referer=self.headers["Referer"]
|
||||
)
|
||||
# page.wait_for_timeout(10000)
|
||||
time.sleep(1)
|
||||
|
||||
# await page.reload()
|
||||
|
||||
# cookies = context.cookies
|
||||
# print(cookies)
|
||||
|
||||
print(f"page.url:: {page.url}")
|
||||
self.origin_url = page.url
|
||||
|
||||
print(page.content().encode("utf8"))
|
||||
|
||||
print(f"run at {time.time() - start} sec")
|
||||
|
||||
return page.content()
|
||||
except Exception as e:
|
||||
logger.error("Exception:%s", e)
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Exception:%s", e)
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
    @yommi_timeit
    def get_html_selenium(self, url: str, referer: str, is_stealth: bool = False,
                          is_headless: bool = False) -> bytes:
        """Load *url* in Chrome via Selenium and return the DOM as UTF-8 bytes.

        Side effects: stores the driver cookies in ``self.cookies`` and the
        final URL in ``self.episode_url``.

        :param url: page to load.
        :param referer: currently unused by this method.
        :param is_stealth: apply selenium-stealth fingerprint patches.
        :param is_headless: run Chrome without a visible window.
        :return: ``outerHTML`` of the document root, UTF-8 encoded.
        """
        from selenium.webdriver.common.by import By
        from selenium import webdriver
        from selenium_stealth import stealth
        from webdriver_manager.chrome import ChromeDriverManager
        import time

        print("get_html_selenium() ==========================================")

        options = webdriver.ChromeOptions()
        # Chromedriver header/sandbox options (required when running on Linux).
        options.add_argument("start-maximized")
        if is_headless:
            options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("window-size=1920x1080")
        options.add_argument("disable-gpu")
        options.add_argument("--disable-dev-shm-usage")
        # Hide the "controlled by automated software" banner / automation flags.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("detach", True)
        options.add_experimental_option("useAutomationExtension", False)

        logger.debug(self.OS_PLATFORM)

        # Bundled per-OS chromedriver location (used only on the fallback path).
        driver_bin_path = os.path.join(
            os.path.dirname(__file__), "bin", self.OS_PLATFORM
        )

        # Chromedriver path
        driver_path = f"{driver_bin_path}/chromedriver"

        if self.OS_PLATFORM == "Darwin":
            print("here:::::::::")
            # macOS/Linux: let webdriver-manager download a matching driver.
            driver = webdriver.Chrome(
                ChromeDriverManager().install(), chrome_options=options
            )
        elif self.OS_PLATFORM == "Linux":
            driver = webdriver.Chrome(
                ChromeDriverManager().install(), chrome_options=options
            )
        else:
            # Other platforms: use the driver binary bundled with the project.
            driver = webdriver.Chrome(
                executable_path=driver_path, chrome_options=options
            )

        if is_stealth:
            # Patch common automation fingerprints (navigator.webdriver etc.).
            stealth(
                driver,
                languages=["ko-KR", "ko"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
            )

        driver.get(url)

        logger.debug(f"current_url:: {driver.current_url}")
        cookies_list = driver.get_cookies()

        cookies_dict = {}
        for cookie in cookies_list:
            cookies_dict[cookie["name"]] = cookie["value"]

        # Keep the raw cookie list and the post-redirect URL for later fetches.
        self.cookies = cookies_list
        self.episode_url = driver.current_url
        time.sleep(1)
        # Grab the document root and serialize the whole DOM.
        elem = driver.find_element(By.XPATH, "//*")
        source_code = elem.get_attribute("outerHTML")

        logger.debug(source_code)

        driver.close()

        driver.quit()

        return source_code.encode("utf-8")
|
||||
|
||||
# Create a request interceptor
|
||||
@staticmethod
|
||||
def interceptor(request):
|
||||
del request.headers["Referer"] # Delete the header first
|
||||
request.headers[
|
||||
"Referer"
|
||||
] = "https://anilife.live/g/l?id=0a36917f-39cc-43ea-b0c6-0c86d27c2408"
|
||||
|
||||
@staticmethod
|
||||
def get_html_seleniumwire(url, referer, wired=False):
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from seleniumwire import webdriver as wired_webdriver
|
||||
from selenium_stealth import stealth
|
||||
import time
|
||||
|
||||
options = webdriver.ChromeOptions()
|
||||
# 크롬드라이버 헤더 옵션추가 (리눅스에서 실행시 필수)
|
||||
options.add_argument("start-maximized")
|
||||
options.add_argument("--headless")
|
||||
options.add_argument("--no-sandbox")
|
||||
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||
options.add_experimental_option("useAutomationExtension", False)
|
||||
# 크롬드라이버 경로
|
||||
driver_path = "./bin/Darwin/chromedriver"
|
||||
if wired:
|
||||
driver = wired_webdriver.Chrome(
|
||||
executable_path=driver_path, chrome_options=options
|
||||
)
|
||||
else:
|
||||
driver = webdriver.Chrome(
|
||||
executable_path=driver_path, chrome_options=options
|
||||
)
|
||||
|
||||
# stealth ======================================
|
||||
# stealth(
|
||||
# driver,
|
||||
# languages=["en-US", "en"],
|
||||
# vendor="Google Inc.",
|
||||
# platform="Win32",
|
||||
# webgl_vendor="Intel Inc.",
|
||||
# renderer="Intel Iris OpenGL Engine",
|
||||
# fix_hairline=True,
|
||||
# )
|
||||
if wired:
|
||||
driver.request_interceptor = self.interceptor
|
||||
driver.get(url)
|
||||
driver.refresh()
|
||||
time.sleep(1)
|
||||
elem = driver.find_element(By.XPATH, "//*")
|
||||
source_code = elem.get_attribute("outerHTML")
|
||||
|
||||
return source_code.encode("utf-8")
|
||||
|
||||
@staticmethod
|
||||
def get_html_cloudflare(url, cached=False):
|
||||
# scraper = cloudscraper.create_scraper(
|
||||
# # disableCloudflareV1=True,
|
||||
# # captcha={"provider": "return_response"},
|
||||
# delay=10,
|
||||
# browser="chrome",
|
||||
# )
|
||||
# scraper = cfscrape.create_scraper(
|
||||
# browser={"browser": "chrome", "platform": "android", "desktop": False}
|
||||
# )
|
||||
|
||||
# scraper = cloudscraper.create_scraper(
|
||||
# browser={"browser": "chrome", "platform": "windows", "mobile": False},
|
||||
# debug=True,
|
||||
# )
|
||||
|
||||
# LogicAniLife.headers["referer"] = LogicAniLife.referer
|
||||
self.headers["Referer"] = "https://anilife.live/"
|
||||
self.headers[
|
||||
"Cookie"
|
||||
] = "_ga=GA1.1.578607927.1660813724; __gads=ID=10abb8b98b6828ae-2281c943a9d500fd:T=1660813741:RT=1660813741:S=ALNI_MYU_iB2lBgSrEQUBwhKpNsToaqQ8A; sbtsck=javuwDzcOJqUyweM1OQeNGzHbjoHp7Cgw44XnPdM738c3E=; SPSI=e48379959d54a6a62cc7abdcafdb2761; SPSE=h5HfMGLJzLqzNafMD3YaOvHSC9xfh77CcWdKvexp/z5N5OsTkIiYSCudQhFffEfk/0pcOTVf0DpeV0RoNopzig==; anilife_csrf=b93b9f25a12a51cf185805ec4de7cf9d; UTGv2=h46b326af644f4ac5d0eb1502881136b3750; __gpi=UID=000008ba227e99e0:T=1660813741:RT=1660912282:S=ALNI_MaJHIVJIGpQ5nTE9lvypKQxJnn10A; DSR=SXPX8ELcRgh6N/9rNgjpQoNfaX2DRceeKYR0/ul7qTI9gApWQpZxr8jgymf/r0HsUT551vtOv2CMWpIn0Hd26A==; DCSS=89508000A76BBD939F6DDACE5BD9EB902D2212A; DGCC=Wdm; adOtr=7L4Xe58995d; spcsrf=6554fa003bf6a46dd9b7417acfacc20a; _ga_56VYJJ7FTM=GS1.1.1660912281.10.1.1660912576.0.0.0; PRLST=EO"
|
||||
# logger.debug(f"headers:: {LogicAniLife.headers}")
|
||||
|
||||
if self.session is None:
|
||||
self.session = requests.Session()
|
||||
self.session.headers = self.headers
|
||||
|
||||
# LogicAniLife.session = requests.Session()
|
||||
|
||||
sess = cloudscraper.create_scraper(
|
||||
browser={"browser": "firefox", "platform": "windows", "desktop": True},
|
||||
debug=False,
|
||||
sess=self.session,
|
||||
delay=10,
|
||||
)
|
||||
|
||||
# print(scraper.get(url, headers=LogicAniLife.headers).content)
|
||||
# print(scraper.get(url).content)
|
||||
# return scraper.get(url, headers=LogicAniLife.headers).content
|
||||
# print(LogicAniLife.headers)
|
||||
return sess.get(
|
||||
url, headers=self.session.headers, timeout=10, allow_redirects=True
|
||||
).content.decode("utf8", errors="replace")
|
||||
145
lib/misc.py
Normal file
145
lib/misc.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import asyncio
|
||||
import threading
|
||||
import traceback
|
||||
from asyncio import Task
|
||||
from typing import Awaitable, T
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
def start_cor(loop, url):
    """Submit hello_async(url) to *loop* from this thread and print its result."""
    future = asyncio.run_coroutine_threadsafe(hello_async(url), loop)
    print(future.result())
||||
|
||||
|
||||
def _start_background_loop(loop):
    """Thread target: adopt *loop* as this thread's event loop and run it forever."""
    asyncio.set_event_loop(loop)
    loop.run_forever()
|
||||
|
||||
|
||||
# Dedicated background event loop: created once at import time and driven
# forever by a daemon thread, so synchronous code can hand coroutines to it
# via asyncio.run_coroutine_threadsafe (see asyncio_run2).
_LOOP = asyncio.new_event_loop()
_LOOP_THREAD = threading.Thread(
    target=_start_background_loop, args=(_LOOP,), daemon=True
)
_LOOP_THREAD.start()
|
||||
|
||||
|
||||
# =================================================================#
|
||||
def asyncio_run(future, as_task=True):
    """
    A better implementation of `asyncio.run`.

    :param future: A future or task or call of an async method.
    :param as_task: Forces the future to be scheduled as task (needed for e.g. aiohttp).
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop running in this thread: drive a fresh one to completion.
        fresh_loop = asyncio.new_event_loop()
        return fresh_loop.run_until_complete(_to_task(future, as_task, fresh_loop))
    # A loop is already running: patch it to allow nested execution.
    import nest_asyncio
    nest_asyncio.apply(loop)
    return asyncio.run(_to_task(future, as_task, loop))
|
||||
|
||||
|
||||
def asyncio_run2(coro: "Awaitable[T]", timeout=60) -> "T":
    """
    Runs the coroutine in an event loop running on a background thread,
    and blocks the current thread until it returns a result.
    This plays well with gevent, since it can yield on the Future result call.

    Bug fix: the annotations are now lazy string annotations, because
    ``from typing import T`` is invalid (``typing`` exports no ``T``) and the
    eager annotation would raise at definition time once that import fails.

    :param coro: A coroutine, typically an async method
    :param timeout: How many seconds we should wait for a result before raising an error
    :return: the coroutine's result, or None when it raised (the error is logged)
    """
    try:
        # _LOOP runs forever on a daemon thread (module-level setup above).
        return asyncio.run_coroutine_threadsafe(coro, _LOOP).result(timeout=timeout)
    except Exception as e:
        logger.error("Exception:%s", e)
        logger.error(traceback.format_exc())
|
||||
|
||||
|
||||
def _to_task(future, as_task, loop):
|
||||
if not as_task or isinstance(future, Task):
|
||||
return future
|
||||
return loop.create_task(future)
|
||||
|
||||
|
||||
def hello_sync(url: str):
    """Open *url* on a remote Chrome over CDP via sync Playwright, print the title."""
    from playwright.sync_api import sync_playwright

    with sync_playwright() as pw:
        # Attach to an already-running browser instead of launching one.
        browser = pw.chromium.connect_over_cdp('http://192.168.0.2:14444')
        page = browser.new_page()
        page.goto(url)
        print(page.title())
        browser.close()
|
||||
|
||||
|
||||
async def _test(url):
    """Demo coroutine: sleep two seconds, print a marker, and return 'ok'."""
    outcome = 'ok'
    await asyncio.sleep(2)
    print('_test')
    return outcome
|
||||
|
||||
|
||||
async def compute(x, y):
    """Demo coroutine: announce the work, wait one second, return x + y."""
    print("Compute %s + %s ..." % (x, y))
    await asyncio.sleep(1.0)
    total = x + y
    return total
|
||||
|
||||
|
||||
async def print_sum(x, y):
    """Demo coroutine: await compute(x, y) and print the full equation."""
    total = await compute(x, y)
    print("%s + %s = %s" % (x, y, total))
|
||||
|
||||
|
||||
async def _thread(url, loop):
    """Drive hello_async(url) to completion on *loop* when one is supplied."""
    if loop is None:
        return
    loop.run_until_complete(hello_async(url, loop))
    print("")
|
||||
|
||||
|
||||
async def hello_async(url: str, loop=None):
    """Load *url* in headless Chromium via async Playwright and print the title."""
    from playwright.async_api import async_playwright

    async with async_playwright() as pw:
        print("here")
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        # NOTE(review): in the async API page.title() is a coroutine — this
        # prints the coroutine object; probably meant `await page.title()`.
        print(page.title())

        await browser.close()
|
||||
|
||||
|
||||
async def hello(url: str):
    """Experimental stub: prints markers and sleeps; the playwright fetch is disabled."""
    from playwright.async_api import async_playwright
    print("hi")
    try:
        # Probe that gevent is importable alongside asyncio.
        from gevent.event import AsyncResult
        print(AsyncResult())
        await asyncio.sleep(2)
        print("hi")
    except Exception as e:
        logger.error("Exception:%s", e)
        logger.error(traceback.format_exc())
|
||||
17
lib/util.py
17
lib/util.py
@@ -4,9 +4,12 @@
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import time
|
||||
import traceback
|
||||
import platform
|
||||
import subprocess
|
||||
from functools import wraps
|
||||
|
||||
# third-party
|
||||
from sqlalchemy.ext.declarative import DeclarativeMeta
|
||||
# sjva 공용
|
||||
@@ -38,7 +41,21 @@ def read_file(filename):
|
||||
logger.error(traceback.format_exc())
|
||||
|
||||
|
||||
def yommi_timeit(func):
    """Decorator: print how long each call to *func* takes, then return its result."""
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        total_time = elapsed
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return result

    return timeit_wrapper
|
||||
|
||||
|
||||
class Util(object):
|
||||
|
||||
@staticmethod
|
||||
def change_text_for_use_filename(text):
|
||||
# text = text.replace('/', '')
|
||||
|
||||
Reference in New Issue
Block a user