Merge branch 'marvel'

Joakim Holm 2023-05-14 17:40:03 +02:00
commit f064dfa9a6
7 changed files with 202 additions and 0 deletions

.gitignore

@@ -36,3 +36,4 @@ MANIFEST
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
cookies.txt

README.md

@@ -12,6 +12,7 @@ grawlix currently supports downloading from the following sources:
- [Flipp](https://flipp.dk)
- [Internet Archive](https://archive.org)
- [Manga Plus](https://mangaplus.shueisha.co.jp)
- [Marvel Unlimited](https://marvel.com)
- [Royal Road](https://www.royalroad.com)
- [Saxo](https://saxo.com)
- [Webtoons](https://webtoons.com)

grawlix/__main__.py

@@ -8,6 +8,7 @@ from . import arguments, logging
from typing import Tuple, Optional
from rich.progress import Progress
from functools import partial
import os
import asyncio
@@ -48,6 +49,20 @@ def get_urls(options) -> list[str]:
    return urls
def get_cookie_file(options) -> Optional[str]:
    """
    Get path to cookie file

    :param options: Cli arguments
    :returns: Path to cookie file
    """
    if options.cookie_file is not None and os.path.exists(options.cookie_file):
        return options.cookie_file
    if os.path.exists("./cookies.txt"):
        return "./cookies.txt"
    return None

async def authenticate(source: Source, config: Config, options):
    """
    Authenticate with source
@@ -61,6 +76,10 @@ async def authenticate(source: Source, config: Config, options):
        username, password, library = get_login(source, config, options)
        await source.login(username, password, library=library)
        source.authenticated = True
    if source.supports_cookies:
        cookie_file = get_cookie_file(options)
        if cookie_file:
            source.load_cookies(cookie_file)
    else:
        raise SourceNotAuthenticated
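
The lookup order in get_cookie_file is: an explicit --cookies path (only if it exists on disk), then a cookies.txt in the working directory, then nothing. A minimal standalone sketch of that precedence, with SimpleNamespace standing in for the parsed CLI arguments:

    # Standalone sketch of the lookup order above; `SimpleNamespace`
    # stands in for the argparse.Namespace the real code receives.
    import os
    from types import SimpleNamespace
    from typing import Optional

    def get_cookie_file(options) -> Optional[str]:
        # An explicit --cookies path wins, but only if the file exists
        if options.cookie_file is not None and os.path.exists(options.cookie_file):
            return options.cookie_file
        # Fall back to a cookies.txt in the current working directory
        if os.path.exists("./cookies.txt"):
            return "./cookies.txt"
        return None

    print(get_cookie_file(SimpleNamespace(cookie_file=None)))
    # -> "./cookies.txt" if present, otherwise None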

grawlix/arguments.py

@@ -44,6 +44,12 @@ def parse_arguments() -> argparse.Namespace:
        help = "Library for login",
        dest = "library",
    )
    parser.add_argument(
        '-c',
        '--cookies',
        help = "Path to netscape cookie file",
        dest = "cookie_file"
    )
    # Outputs
    parser.add_argument(
        '-o',
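
Because the flag is wired to dest = "cookie_file", the parsed namespace exposes the path as options.cookie_file, which get_cookie_file above consults. A hedged sketch of just this flag in isolation (the real parser has many more arguments, and the grawlix entry-point name is assumed, not shown in this diff):

    # Sketch: how the new flag lands on the namespace; only the cookie
    # flag from the diff is reproduced here.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c',
        '--cookies',
        help = "Path to netscape cookie file",
        dest = "cookie_file"
    )

    options = parser.parse_args(["--cookies", "cookies.txt"])
    print(options.cookie_file)  # -> cookies.txt

    # On the command line this would look something like
    # (entry-point name assumed):  grawlix --cookies cookies.txt <url>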

grawlix/sources/__init__.py

@@ -5,6 +5,7 @@ from .ereolen import Ereolen
from .flipp import Flipp
from .internet_archive import InternetArchive
from .mangaplus import MangaPlus
from .marvel import Marvel
from .royal_road import RoyalRoad
from .saxo import Saxo
from .webtoons import Webtoons
@@ -56,6 +57,7 @@ def get_source_classes() -> list[type[Source]]:
        Flipp,
        InternetArchive,
        MangaPlus,
        Marvel,
        RoyalRoad,
        Saxo,
        Webtoons
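
grawlix's dispatch code is not part of this diff, but a registry like get_source_classes() is presumably consumed by matching a url against each class's match patterns. A hedged, self-contained sketch (find_source is a hypothetical helper, not grawlix's actual API):

    # Hedged sketch: route a url to the first source class whose
    # `match` patterns accept it. `find_source` is hypothetical.
    import re

    def find_source(url: str, source_classes: list) -> type | None:
        for cls in source_classes:
            # Each Source subclass declares its url regexes in `match`
            if any(re.match(pattern, url) for pattern in cls.match):
                return cls
        return None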

grawlix/sources/marvel.py (new file)

@@ -0,0 +1,138 @@
from grawlix.book import Book, Metadata, ImageList, OnlineFile, Series, Result
from grawlix.exceptions import InvalidUrl, DataNotFound
from .source import Source

import re

# Personal marvel api key
API_KEY = "83ac0da31d3f6801f2c73c7e07ad76e8"

class Marvel(Source[str]):
    name: str = "Marvel"
    match = [
        r"https://www.marvel.com/comics/issue/\d+/.+",
        r"https://read.marvel.com/#/book/\d+",
        r"https://www.marvel.com/comics/series/\d+/.+"
    ]
    _authentication_methods: list[str] = [ "cookies" ]

    async def download(self, url: str) -> Result[str]:
        match_index = self.get_match_index(url)
        if match_index == 0:
            issue_id = await self._get_issue_id(url)
            return await self.download_book_from_id(issue_id)
        if match_index == 1:
            issue_id = url.split("/")[-1]
            return await self.download_book_from_id(issue_id)
        if match_index == 2:
            return await self._download_series(url)
        raise InvalidUrl

    async def _download_series(self, url: str) -> Series[str]:
        """
        Download series

        :param url: Url of series
        :returns: Series data
        """
        series_id = url.split("/")[-2]
        issue_ids = await self._download_issue_ids(series_id)
        metadata = await self._download_series_metadata(series_id)
        return Series(
            title = metadata["data"]["results"][0]["title"],
            book_ids = issue_ids
        )

    async def _download_issue_ids(self, series_id: str) -> list[str]:
        """
        Download issue ids from series

        :param series_id: Id of comic series on marvel.com
        :returns: List of comic ids for marvel comics
        """
        response = await self._client.get(
            f"https://api.marvel.com/browse/comics?byType=comic_series&isDigital=1&limit=10000&byId={series_id}",
        )
        issue_ids = [issue["digital_id"] for issue in response.json()["data"]["results"]]
        return issue_ids

    async def _download_series_metadata(self, series_id: str) -> dict:
        """
        Download series metadata

        :param series_id: Id of comic series on marvel.com
        :returns: Dictionary with metadata
        """
        response = await self._client.get(
            f"https://gateway.marvel.com:443/v1/public/series/{series_id}?apikey={API_KEY}",
            headers = {
                "Referer": "https://developer.marvel.com/"
            }
        )
        return response.json()

    async def _get_issue_id(self, url: str) -> str:
        """
        Download issue id from url

        :param url: Url to issue info page
        :return: Issue id
        """
        response = await self._client.get(url)
        search = re.search(r"digital_comic_id: \"(\d+)\"", response.text)
        if not search:
            raise DataNotFound
        return search.group(1)

    async def download_book_from_id(self, issue_id: str) -> Book:
        return Book(
            metadata = await self._download_issue_metadata(issue_id),
            data = await self._download_issue_pages(issue_id)
        )

    async def _download_issue_metadata(self, issue_id: str) -> Metadata:
        """
        Download and parse metadata for issue

        :param issue_id: Identifier for issue
        :returns: Issue metadata
        """
        response = await self._client.get(
            f"https://bifrost.marvel.com/v1/catalog/digital-comics/metadata/{issue_id}"
        )
        issue_meta = response.json()["data"]["results"][0]["issue_meta"]
        return Metadata(
            title = issue_meta["title"],
            series = issue_meta["series_title"],
            publisher = "Marvel",
            authors = [c["full_name"] for c in issue_meta["creators"]["extended_list"]]
        )

    async def _download_issue_pages(self, issue_id: str) -> ImageList:
        """
        Download list of page links for issue

        :param issue_id: Identifier for issue
        :returns: List of links to comic pages
        """
        response = await self._client.get(
            f"https://bifrost.marvel.com/v1/catalog/digital-comics/web/assets/{issue_id}"
        )
        images = []
        for page in response.json()["data"]["results"][0]["pages"]:
            images.append(
                OnlineFile(
                    url = page["assets"]["source"],
                    extension = "jpg"
                )
            )
        return ImageList(images)
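
The three patterns in Marvel.match map one-to-one onto the branches of download: issue info pages are scraped for a digital_comic_id, reader urls carry the issue id as their last path segment, and series urls carry the series id second-to-last. A standalone sketch of that routing (the example urls and ids are invented for illustration):

    # Standalone sketch of the url routing in Marvel.download;
    # example urls are invented.
    import re

    MATCH = [
        r"https://www.marvel.com/comics/issue/\d+/.+",   # issue info page
        r"https://read.marvel.com/#/book/\d+",           # digital reader
        r"https://www.marvel.com/comics/series/\d+/.+",  # series page
    ]

    def describe(url: str) -> str:
        for index, pattern in enumerate(MATCH):
            if re.match(pattern, url):
                if index == 0:
                    return "issue page: id scraped from digital_comic_id in the html"
                if index == 1:
                    return f"reader url: issue id {url.split('/')[-1]}"
                return f"series url: series id {url.split('/')[-2]}"
        return "no match: InvalidUrl"

    print(describe("https://read.marvel.com/#/book/12345"))
    # -> reader url: issue id 12345
    print(describe("https://www.marvel.com/comics/series/2345/some_series"))
    # -> series url: series id 2345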

grawlix/sources/source.py

@@ -1,5 +1,8 @@
from grawlix.book import Book, Series, Result
from typing import Generic, TypeVar, Tuple, Optional
from http.cookiejar import MozillaCookieJar
import re
import httpx
@@ -42,6 +45,25 @@ class Source(Generic[T]):
        raise NotImplementedError

    @property
    def supports_cookies(self) -> bool:
        """Does the source support authentication with a cookie file"""
        return "cookies" in self._authentication_methods

    def load_cookies(self, cookie_file: str):
        """
        Authenticate with source using a netscape cookie file

        :param cookie_file: Path to netscape cookie file
        """
        if self.supports_cookies:
            cookie_jar = MozillaCookieJar()
            cookie_jar.load(cookie_file, ignore_expires=True)
            self._client.cookies.update(cookie_jar)
            self.authenticated = True

    async def download(self, url: str) -> Result[T]:
        """
        Download book metadata from source
@@ -60,3 +82,16 @@ class Source(Generic[T]):
        :returns: Downloaded book metadata
        """
        raise NotImplementedError

    def get_match_index(self, url: str) -> Optional[int]:
        """
        Find the first regex in `self.match` that matches url

        :param url: Url to match
        :returns: Index of regex
        """
        for index, match in enumerate(self.match):
            if re.match(match, url):
                return index
        return None
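
load_cookies leans on two facts: the standard library's MozillaCookieJar parses Netscape-format cookies.txt exports, and httpx's Cookies.update accepts any http.cookiejar.CookieJar. A self-contained sketch of the same mechanics (the cookies.txt path is assumed to exist and hold a logged-in session):

    # Standalone sketch of the cookie loading in load_cookies; assumes
    # a Netscape-format cookies.txt exported from a browser session.
    from http.cookiejar import MozillaCookieJar
    import httpx

    jar = MozillaCookieJar()
    # ignore_expires=True also loads cookies whose expiry has passed,
    # since an exported session may still be valid server-side.
    jar.load("cookies.txt", ignore_expires=True)

    client = httpx.Client()
    client.cookies.update(jar)  # httpx accepts any http.cookiejar.CookieJar
    # Requests made through `client` now carry the session cookies.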