diff --git a/.gitignore b/.gitignore
index c5aed4a..b379466 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ MANIFEST
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
+cookies.txt
diff --git a/README.md b/README.md
index a601e3a..eea5e1f 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ grawlix currently supports downloading from the following sources:
 - [Flipp](https://flipp.dk)
 - [Internet Archive](https://archive.org)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)
+- [Marvel Unlimited](https://marvel.com)
 - [Royal Road](https://www.royalroad.com)
 - [Saxo](https://saxo.com)
 - [Webtoons](https://webtoons.com)
diff --git a/grawlix/__main__.py b/grawlix/__main__.py
index 6439a5d..4669b97 100644
--- a/grawlix/__main__.py
+++ b/grawlix/__main__.py
@@ -8,6 +8,7 @@ from . import arguments, logging
 from typing import Tuple, Optional
 from rich.progress import Progress
 from functools import partial
+import os
 import asyncio
 
 
@@ -48,6 +49,20 @@ def get_urls(options) -> list[str]:
     return urls
 
 
+def get_cookie_file(options) -> Optional[str]:
+    """
+    Get path to cookie file
+
+    :param options: CLI arguments
+    :returns: Path to cookie file, or None if no file was found
+    """
+    if options.cookie_file is not None and os.path.exists(options.cookie_file):
+        return options.cookie_file
+    if os.path.exists("./cookies.txt"):
+        return "./cookies.txt"
+    return None
+
+
 async def authenticate(source: Source, config: Config, options):
     """
     Authenticate with source
@@ -61,6 +76,10 @@ async def authenticate(source: Source, config: Config, options):
         username, password, library = get_login(source, config, options)
         await source.login(username, password, library=library)
         source.authenticated = True
+    elif source.supports_cookies:
+        cookie_file = get_cookie_file(options)
+        if cookie_file:
+            source.load_cookies(cookie_file)
     else:
         raise SourceNotAuthenticated
 
diff --git a/grawlix/arguments.py b/grawlix/arguments.py
index 9c145bc..1e5eba9 100644
--- a/grawlix/arguments.py
+++ b/grawlix/arguments.py
@@ -44,6 +44,12 @@ def parse_arguments() -> argparse.Namespace:
         help = "Library for login",
         dest = "library",
     )
+    parser.add_argument(
+        '-c',
+        '--cookies',
+        help = "Path to Netscape cookie file",
+        dest = "cookie_file",
+    )
     # Outputs
     parser.add_argument(
         '-o',
diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py
index 7e1fbf4..69cc2bf 100644
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -5,6 +5,7 @@ from .ereolen import Ereolen
 from .flipp import Flipp
 from .internet_archive import InternetArchive
 from .mangaplus import MangaPlus
+from .marvel import Marvel
 from .royal_road import RoyalRoad
 from .saxo import Saxo
 from .webtoons import Webtoons
@@ -56,6 +57,7 @@ def get_source_classes() -> list[type[Source]]:
         Flipp,
         InternetArchive,
         MangaPlus,
+        Marvel,
         RoyalRoad,
         Saxo,
         Webtoons
diff --git a/grawlix/sources/marvel.py b/grawlix/sources/marvel.py
new file mode 100644
index 0000000..d42910f
--- /dev/null
+++ b/grawlix/sources/marvel.py
@@ -0,0 +1,150 @@
+from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
+from grawlix.exceptions import InvalidUrl, DataNotFound
+
+from .source import Source
+
+import re
+
+# Personal Marvel API key
+API_KEY = "83ac0da31d3f6801f2c73c7e07ad76e8"
+
+class Marvel(Source[str]):
+    name: str = "Marvel"
+    match = [
+        r"https://www.marvel.com/comics/issue/\d+/.+",
+        r"https://read.marvel.com/#/book/\d+",
+        r"https://www.marvel.com/comics/series/\d+/.+"
+    ]
+    _authentication_methods: list[str] = [ "cookies" ]
+
+
+    async def download(self, url: str) -> Result[str]:
+        """
+        Download issue or series from url
+
+        :param url: Url of issue or series on marvel.com
+        :returns: Issue or series data
+        """
+        match_index = self.get_match_index(url)
+        if match_index == 0:
+            issue_id = await self._get_issue_id(url)
+            return await self.download_book_from_id(issue_id)
+        if match_index == 1:
+            issue_id = url.split("/")[-1]
+            return await self.download_book_from_id(issue_id)
+        if match_index == 2:
+            return await self._download_series(url)
+        raise InvalidUrl
+
+
+    async def _download_series(self, url: str) -> Series[str]:
+        """
+        Download series
+
+        :param url: Url of series
+        :returns: Series data
+        """
+        series_id = url.split("/")[-2]
+        issue_ids = await self._download_issue_ids(series_id)
+        metadata = await self._download_series_metadata(series_id)
+        return Series(
+            title = metadata["data"]["results"][0]["title"],
+            book_ids = issue_ids
+        )
+
+
+    async def _download_issue_ids(self, series_id: str) -> list[str]:
+        """
+        Download issue ids from series
+
+        :param series_id: Id of comic series on marvel.com
+        :returns: List of comic ids for marvel comics
+        """
+        response = await self._client.get(
+            f"https://api.marvel.com/browse/comics?byType=comic_series&isDigital=1&limit=10000&byId={series_id}",
+        )
+        issue_ids = [issue["digital_id"] for issue in response.json()["data"]["results"]]
+        return issue_ids
+
+
+    async def _download_series_metadata(self, series_id: str) -> dict:
+        """
+        Download series metadata
+
+        :param series_id: Id of comic series on marvel.com
+        :returns: Dictionary with metadata
+        """
+        response = await self._client.get(
+            f"https://gateway.marvel.com:443/v1/public/series/{series_id}?apikey={API_KEY}",
+            headers = {
+                "Referer": "https://developer.marvel.com/"
+            }
+        )
+        return response.json()
+
+
+    async def _get_issue_id(self, url: str) -> str:
+        """
+        Download issue id from url
+
+        :param url: Url to issue info page
+        :returns: Issue id
+        """
+        response = await self._client.get(url)
+        search = re.search(r"digital_comic_id: \"(\d+)\"", response.text)
+        if not search:
+            raise DataNotFound
+        return search.group(1)
+
+
+    async def download_book_from_id(self, issue_id: str) -> Book:
+        """
+        Download issue from issue id
+
+        :param issue_id: Identifier for issue
+        :returns: Issue data
+        """
+        return Book(
+            metadata = await self._download_issue_metadata(issue_id),
+            data = await self._download_issue_pages(issue_id)
+        )
+
+
+    async def _download_issue_metadata(self, issue_id: str) -> Metadata:
+        """
+        Download and parse metadata for issue
+
+        :param issue_id: Identifier for issue
+        :returns: Issue metadata
+        """
+        response = await self._client.get(
+            f"https://bifrost.marvel.com/v1/catalog/digital-comics/metadata/{issue_id}"
+        )
+        issue_meta = response.json()["data"]["results"][0]["issue_meta"]
+        return Metadata(
+            title = issue_meta["title"],
+            series = issue_meta["series_title"],
+            publisher = "Marvel",
+            authors = [c["full_name"] for c in issue_meta["creators"]["extended_list"]]
+        )
+
+
+    async def _download_issue_pages(self, issue_id: str) -> ImageList:
+        """
+        Download list of page links for issue
+
+        :param issue_id: Identifier for issue
+        :returns: List of links to comic pages
+        """
+        response = await self._client.get(
+            f"https://bifrost.marvel.com/v1/catalog/digital-comics/web/assets/{issue_id}"
+        )
+        images = []
+        for page in response.json()["data"]["results"][0]["pages"]:
+            images.append(
+                OnlineFile(
+                    url = page["assets"]["source"],
+                    extension = "jpg"
+                )
+            )
+        return ImageList(images)
diff --git a/grawlix/sources/source.py b/grawlix/sources/source.py
index 0658cc3..2e2eec7 100644
--- a/grawlix/sources/source.py
+++ b/grawlix/sources/source.py
@@ -1,5 +1,7 @@
 from grawlix.book import Book, Series, Result
-from typing import Generic, TypeVar, Tuple
+from typing import Generic, TypeVar, Tuple, Optional
+from http.cookiejar import MozillaCookieJar
+import re
 import httpx
 
 
@@ -42,6 +44,25 @@ class Source(Generic[T]):
         raise NotImplementedError
 
 
+    @property
+    def supports_cookies(self) -> bool:
+        """Does the source support authentication with a cookie file"""
+        return "cookies" in self._authentication_methods
+
+
+    def load_cookies(self, cookie_file: str) -> None:
+        """
+        Authenticate with source using a Netscape cookie file
+
+        :param cookie_file: Path to Netscape cookie file
+        """
+        if self.supports_cookies:
+            cookie_jar = MozillaCookieJar()
+            cookie_jar.load(cookie_file, ignore_expires=True)
+            self._client.cookies.update(cookie_jar)
+            self.authenticated = True
+
+
     async def download(self, url: str) -> Result[T]:
         """
         Download book metadata from source
@@ -60,3 +81,16 @@
         :returns: Downloaded book metadata
         """
         raise NotImplementedError
+
+
+    def get_match_index(self, url: str) -> Optional[int]:
+        """
+        Find the first regex in `self.match` that matches url
+
+        :param url: Url to match
+        :returns: Index of the matching regex, or None if no pattern matches
+        """
+        for index, match in enumerate(self.match):
+            if re.match(match, url):
+                return index
+        return None
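With this change, a cookie-based source is authenticated by passing `-c`/`--cookies <path>`, or by placing a `cookies.txt` in the working directory. The file must be in Netscape cookie format (e.g. as exported by a browser extension). Below is a minimal standalone sketch of the loading mechanism behind `Source.load_cookies`, assuming a `cookies.txt` exists in the current directory:

```python
from http.cookiejar import MozillaCookieJar

import httpx

# Parse a Netscape-format cookie file. ignore_expires=True mirrors
# Source.load_cookies above: entries with a missing or passed expiry
# (common in exported session cookies) are kept instead of skipped.
jar = MozillaCookieJar()
jar.load("cookies.txt", ignore_expires=True)

# httpx.Cookies.update accepts any http.cookiejar.CookieJar, so the
# parsed cookies can be attached directly to the client that performs
# the API requests.
client = httpx.Client()
client.cookies.update(jar)
print(f"loaded {len(jar)} cookies")
```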