|
| 1 | +import asyncio |
| 2 | +import dataclasses |
| 3 | +import json |
| 4 | +import logging |
| 5 | +import time |
| 6 | +from dataclasses import dataclass |
| 7 | +from pathlib import Path |
| 8 | +from tempfile import mkdtemp |
| 9 | +from typing import Optional |
| 10 | + |
| 11 | +import nodriver |
| 12 | + |
| 13 | +logger = logging.getLogger('extractor') |
| 14 | + |
| 15 | + |
| 16 | +@dataclass |
| 17 | +class TokenInfo: |
| 18 | + updated: int |
| 19 | + potoken: str |
| 20 | + visitor_data: str |
| 21 | + |
| 22 | + def to_json(self) -> str: |
| 23 | + as_dict = dataclasses.asdict(self) |
| 24 | + as_json = json.dumps(as_dict) |
| 25 | + return as_json |
| 26 | + |
| 27 | + |
| 28 | +class PotokenExtractor: |
| 29 | + |
| 30 | + def __init__(self, loop: asyncio.AbstractEventLoop, |
| 31 | + update_interval: float = 3600, |
| 32 | + browser_path: Optional[Path] = None) -> None: |
| 33 | + self.update_interval: float = update_interval |
| 34 | + self.browser_path: Optional[Path] = browser_path |
| 35 | + self.profile_path = mkdtemp() # cleaned up on exit by nodriver |
| 36 | + self._loop = loop |
| 37 | + self._token_info: Optional[TokenInfo] = None |
| 38 | + self._ongoing_update: asyncio.Lock = asyncio.Lock() |
| 39 | + self._extraction_done: asyncio.Event = asyncio.Event() |
| 40 | + self._update_requested: asyncio.Event = asyncio.Event() |
| 41 | + |
| 42 | + def get(self) -> Optional[TokenInfo]: |
| 43 | + return self._token_info |
| 44 | + |
| 45 | + async def run_once(self) -> Optional[TokenInfo]: |
| 46 | + await self._update() |
| 47 | + return self.get() |
| 48 | + |
| 49 | + async def run(self) -> None: |
| 50 | + await self._update() |
| 51 | + while True: |
| 52 | + try: |
| 53 | + await asyncio.wait_for(self._update_requested.wait(), timeout=self.update_interval) |
| 54 | + logger.debug('initiating force update') |
| 55 | + except asyncio.TimeoutError: |
| 56 | + logger.debug('initiating scheduled update') |
| 57 | + await self._update() |
| 58 | + self._update_requested.clear() |
| 59 | + |
| 60 | + def request_update(self) -> bool: |
| 61 | + """Request immediate update, return False if update request is already set""" |
| 62 | + if self._ongoing_update.locked(): |
| 63 | + logger.debug('update process is already running') |
| 64 | + return False |
| 65 | + if self._update_requested.is_set(): |
| 66 | + logger.debug('force update has already been requested') |
| 67 | + return False |
| 68 | + self._loop.call_soon_threadsafe(self._update_requested.set) |
| 69 | + logger.debug('force update requested') |
| 70 | + return True |
| 71 | + |
| 72 | + @staticmethod |
| 73 | + def _extract_token(request: nodriver.cdp.network.Request) -> Optional[TokenInfo]: |
| 74 | + post_data = request.post_data |
| 75 | + try: |
| 76 | + post_data_json = json.loads(post_data) |
| 77 | + visitor_data = post_data_json['context']['client']['visitorData'] |
| 78 | + potoken = post_data_json['serviceIntegrityDimensions']['poToken'] |
| 79 | + except (json.JSONDecodeError, TypeError, KeyError) as e: |
| 80 | + logger.warning(f'failed to extract token from request: {type(e)}, {e}') |
| 81 | + return None |
| 82 | + token_info = TokenInfo( |
| 83 | + updated=int(time.time()), |
| 84 | + potoken=potoken, |
| 85 | + visitor_data=visitor_data |
| 86 | + ) |
| 87 | + return token_info |
| 88 | + |
| 89 | + async def _update(self) -> None: |
| 90 | + try: |
| 91 | + await asyncio.wait_for(self._perform_update(), timeout=600) |
| 92 | + except asyncio.TimeoutError: |
| 93 | + logger.error('update failed: hard limit timeout exceeded. Browser might be failing to start properly') |
| 94 | + |
| 95 | + async def _perform_update(self) -> None: |
| 96 | + if self._ongoing_update.locked(): |
| 97 | + logger.debug('update is already in progress') |
| 98 | + return |
| 99 | + |
| 100 | + async with self._ongoing_update: |
| 101 | + logger.info('update started') |
| 102 | + self._extraction_done.clear() |
| 103 | + try: |
| 104 | + browser = await nodriver.start(headless=False, |
| 105 | + browser_executable_path=self.browser_path, |
| 106 | + user_data_dir=self.profile_path) |
| 107 | + except FileNotFoundError as e: |
| 108 | + msg = "could not find Chromium. Make sure it's installed or provide direct path to the executable" |
| 109 | + raise FileNotFoundError(msg) from e |
| 110 | + tab = browser.main_tab |
| 111 | + tab.add_handler(nodriver.cdp.network.RequestWillBeSent, self._send_handler) |
| 112 | + await tab.get('https://www.youtube.com/embed/jNQXAC9IVRw') |
| 113 | + player_clicked = await self._click_on_player(tab) |
| 114 | + if player_clicked: |
| 115 | + await self._wait_for_handler() |
| 116 | + await tab.close() |
| 117 | + browser.stop() |
| 118 | + |
| 119 | + @staticmethod |
| 120 | + async def _click_on_player(tab: nodriver.Tab) -> bool: |
| 121 | + try: |
| 122 | + player = await tab.select('#movie_player', 10) |
| 123 | + except asyncio.TimeoutError: |
| 124 | + logger.warning('update failed: unable to locate video player on the page') |
| 125 | + return False |
| 126 | + else: |
| 127 | + await player.click() |
| 128 | + return True |
| 129 | + |
| 130 | + async def _wait_for_handler(self) -> bool: |
| 131 | + try: |
| 132 | + await asyncio.wait_for(self._extraction_done.wait(), timeout=30) |
| 133 | + except asyncio.TimeoutError: |
| 134 | + logger.warning('update failed: timeout waiting for outgoing API request') |
| 135 | + return False |
| 136 | + else: |
| 137 | + logger.info('update was succeessful') |
| 138 | + return True |
| 139 | + |
| 140 | + async def _send_handler(self, event: nodriver.cdp.network.RequestWillBeSent) -> None: |
| 141 | + if not event.request.method == 'POST': |
| 142 | + return |
| 143 | + if '/youtubei/v1/player' not in event.request.url: |
| 144 | + return |
| 145 | + token_info = self._extract_token(event.request) |
| 146 | + if token_info is None: |
| 147 | + return |
| 148 | + logger.info(f'new token: {token_info.to_json()}') |
| 149 | + self._token_info = token_info |
| 150 | + self._extraction_done.set() |
0 commit comments