Skip to content
Commits on Source (5)
# GitLab CI pipeline for manga-dl.
# Each job is defined exactly once and every job selects runners tagged
# "docker"; duplicate keys would otherwise be resolved silently (last one wins).
stages:
  - mirror
  - test
  - stats
  - release

default:
  image: namboy94/ci-docker-environment:0.8.0
  before_script:
    # Write the SERVER_ACCESS_KEY CI variable to the default SSH identity file
    # and restrict its permissions.
    - echo "$SERVER_ACCESS_KEY" > ~/.ssh/id_rsa
    - chmod 0600 ~/.ssh/id_rsa

github_mirror:
  stage: mirror
  tags: [docker]
  only: [master, develop]
  # Replaces the default before_script: this job uses GITHUB_SSH_KEY instead.
  before_script:
    - echo "$GITHUB_SSH_KEY" > ~/.ssh/id_rsa
    - chmod 0600 ~/.ssh/id_rsa
  script:
    - git-mirror-pusher git@github.com:namboy94/manga-dl.git master develop

stylecheck:
  stage: test
  tags: [docker]
  script:
    # NOTE(review): other jobs run ci-scripts commands without this install
    # step; presumably the image already ships them - confirm and drop if so.
    - python3 -m venv virtual && source virtual/bin/activate && pip install ci-scripts
    - python-codestyle-check

type_check:
  stage: test
  tags: [docker]
  script:
    - python-static-type-check

unittest:
  stage: test
  tags: [docker]
  script:
    - python3 -m venv virtual && source virtual/bin/activate && pip install ci-scripts
    - python-unittest

gitstats:
  stage: stats
  tags: [docker]
  script:
    - gitstats-gen

docgen:
  stage: stats
  tags: [docker]
  script:
    - sphinx-docgen

release_upload:
  stage: release
  only: [master]
  tags: [docker]
  script:
    - python3 -m venv virtual && source virtual/bin/activate && pip install ci-scripts
    - github-release-upload $(cat version) "$(changelog-reader)"
    - gitlab-release-upload $(cat version) "$(changelog-reader)"

pypi_upload:
  stage: release
  only: [master]
  tags: [docker]
  script:
    - python3 -m venv virtual && source virtual/bin/activate && pip install ci-scripts
    - pypi-upload
V 0.2.0:
- Added mangadex support
- Redid application structure
- Now uses docker image for CI
V 0.1.6:
- Integrated ci-scripts
V 0.1.5:
......
......@@ -8,56 +8,6 @@
This is a Manga Downloader that can download manga series from various sources.
Currently supported are:
* [Mangafox](http://mangafox.me)
## Usage
### Installation
**Via pip**
$ pip install manga_dl --user
**From source**
$ python setup.py install --user
**Binary Files**
If you would rather not install the program and instead use a single binary
file, you can download one from our [GitHub releases page](https://github.com/namboy94/manga-dl/releases).
### CLI
The program offers a simple wget-like CLI. To download a series, simply enter:
$ manga-dl <URL>
This will create a new directory with the series name in the current working
directory and subsequently download all currently available pages of the series.
The CLI does offer some configuration:
* ```--destination```, ```-d``` (type: string)
- Specifies the directory in which the series will be stored
- Defaults to the current working directory + the series name
* ```--threads```, ```-t``` (type: integer)
- Specifies how many threads the program may use. Using more threads may speed up the parsing
- Defaults to the amount of logical processors the system has
* ```--verbose```, ```-v```
- If set, the program will print its progress to the console
* ```--update```
- If set, the program will only check for content that has not been downloaded yet
* ```--repair```
- If set, the program will check each previously downloaded file for errors
- If errors are found, the file in question is re-downloaded
* ```--zip-volumes```
- Zips the series by volume after the download is completed
* ```--zip-chapters```
- Zips the series by chapter after the download is completed
## Further Information
* [Changelog](CHANGELOG)
......
......@@ -17,8 +17,72 @@ You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
"""
from manga_dl.main import main
import os
import argparse
from typing import List
from manga_dl.scrapers import scrapers
from manga_dl import sentry_dsn
from manga_dl.entities.Chapter import Chapter
from puffotter.init import cli_start, argparse_add_verbosity
def main(args: argparse.Namespace):
    """
    The main script of the manga-dl program.
    Picks the first scraper whose ``url_matches`` accepts ``args.url``,
    loads the chapters for that URL, optionally narrows them down to the
    chapter numbers listed in ``args.chapters`` and downloads what is left.
    :param args: The parsed command line arguments. Reads ``url``, ``out``,
                 ``format`` and ``chapters``
    :return: None
    """
    chapters = []  # type: List[Chapter]

    if args.url is not None:
        scraper = None
        for scraper_cls in scrapers:
            if scraper_cls.url_matches(args.url):
                scraper = scraper_cls(
                    destination=args.out, _format=args.format
                )
                break

        if scraper is None:
            # No scraper class accepted the URL; calling load_chapters on
            # None would raise an AttributeError, so stop with a message.
            print("No scraper found for URL {}".format(args.url))
            return

        chapters = scraper.load_chapters(args.url)

    user_chapters = args.chapters
    if user_chapters is not None:
        # Comma-separated list; chapter numbers are compared as strings
        wanted = [number.strip() for number in user_chapters.split(",")]
        chapters = [
            chapter for chapter in chapters
            if chapter.chapter_number in wanted
        ]

    for c in chapters:
        c.download()
if __name__ == "__main__":
    # Build the command line interface and hand control to cli_start,
    # which receives the main function together with the parser.
    parser = argparse.ArgumentParser()
    # nargs="?" makes the positional argument optional, as its help text
    # promises; without it argparse rejects calls that omit the URL.
    parser.add_argument("url", default=None, nargs="?",
                        help="The URL from which to download. May be left "
                             "blank if a specific ID for a connector was "
                             "provided.")
    parser.add_argument("-c", "--chapters",
                        help="Specifies which chapters to download")
    parser.add_argument("-o", "--out",
                        default=os.path.join(os.path.expanduser("~"),
                                             "Downloads/Manga"),
                        help="Specifies the output path")
    # A tuple keeps the order of the choices stable in the --help output.
    # NOTE(review): Chapter.download handles "cbz", "zip" and "dir"; confirm
    # that the scrapers translate "raw" before it reaches a Chapter.
    parser.add_argument("-f", "--format",
                        choices=("cbz", "raw"), default="cbz",
                        help="The format in which to store the chapters")

    # One "--<name>-id" option per registered scraper class
    for _scraper_cls in scrapers:
        parser.add_argument("--{}-id".format(_scraper_cls.name()),
                            help="Uses specific {} ID instead of an URL"
                            .format(_scraper_cls.name()))

    argparse_add_verbosity(parser)
    cli_start(
        main,
        parser,
        "Thanks for using manga-dl!",
        "manga-dl",
        sentry_dsn
    )
......@@ -16,3 +16,9 @@ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
# Sentry DSN for this project; the manga-dl launcher script imports it and
# passes it to cli_start as the last argument.
sentry_dsn = "https://0c1dcd24a5c346e09115ffebdb780772@sentry.namibsun.net/9"
"""
Sentry DSN used for exception logging
"""
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
import os
import shutil
import logging
import cfscrape
from puffotter.os import makedirs
from typing import Callable, List
from typing import Optional
from subprocess import Popen, DEVNULL
from urllib.request import urlretrieve
class Chapter:
    """
    Class that models a manga chapter
    """

    def __init__(
            self,
            url: str,
            language: str,
            series_name: str,
            chapter_number: str,
            destination_dir: str,
            _format: str,
            page_load_callback: Callable[['Chapter', str], List[str]],
            title: Optional[str] = None
    ):
        """
        Initializes the manga chapter
        :param url: The URL used to fetch page image URLs
        :param language: The language of the chapter
        :param series_name: The name of the series
        :param chapter_number: The chapter number of this chapter.
                               An empty string is stored as "0"
        :param destination_dir: The destination directory in which to store
                                downloaded files by default
        :param _format: The format in which to store the chapter when
                        downloading by default
        :param page_load_callback: Called as callback(chapter, url) the first
                                   time the pages are needed; must return the
                                   page image URLs in the correct order
        :param title: The title of the chapter
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.url = url
        self.language = language
        self.series_name = series_name
        self.chapter_number = chapter_number
        self.destination_dir = destination_dir
        self.format = _format
        self._page_load_callback = page_load_callback
        self._pages = []  # type: List[str]
        self.title = title

        if self.chapter_number == "":
            self.chapter_number = "0"

    @property
    def name(self) -> str:
        """
        :return: The name of the chapter
        """
        name = "{} - Chapter {}".format(self.series_name, self.chapter_number)
        if self.title is not None:
            name += " - " + self.title
        return name

    @property
    def pages(self) -> List[str]:
        """
        Lazy-loads the URLs of the chapter's page images
        :return: The list of page images, in the correct order
        """
        if len(self._pages) == 0:
            self._pages = self._page_load_callback(self, self.url)
        return self._pages

    @staticmethod
    def _image_extension(image_url: str) -> str:
        """
        Determines the file extension to use for a page image
        :param image_url: The URL of the image
        :return: The extension without the leading dot
        """
        # Ignore query string and fragment, they are not part of the name
        path = image_url.split("?", 1)[0].split("#", 1)[0]
        basename = path.rsplit("/", 1)[-1]
        if "." not in basename:
            # NOTE(review): fallback for URLs without an extension;
            # "jpg" is an assumption, adjust if another default fits better
            return "jpg"
        return basename.rsplit(".", 1)[1]

    def download(
            self,
            file_path_override: Optional[str] = None,
            format_override: Optional[str] = None
    ) -> str:
        """
        Downloads the chapter to a local file or directory.
        The images are first stored in a temporary directory and then
        either zipped ("cbz"/"zip") or moved ("dir") to the destination.
        :param file_path_override: Overrides the automatically generated
                                   destination file path
        :param format_override: Overrides the class-wide format
        :return: The path to the downloaded chapter file/directory
        """
        # Local import: only this method needs the module
        import tempfile

        _format = self.format if format_override is None else format_override

        # Use the platform's temporary directory instead of a fixed "/tmp"
        tempdir = os.path.join(tempfile.gettempdir(), self.name)
        makedirs(tempdir, delete_before=True)

        dest_path = os.path.join(self.destination_dir, self.name)
        if file_path_override:
            dest_path = file_path_override
        if not dest_path.endswith("." + _format) and _format != "dir":
            dest_path += "." + _format

        # A bare file name has no directory component; there is nothing
        # to create in that case
        dest_dir = os.path.dirname(dest_path)
        if dest_dir:
            makedirs(dest_dir)

        pages = self.pages
        index_fill = len(str(len(pages)))
        downloaded = []

        for i, image_url in enumerate(pages):

            # A "CF!" prefix marks images that are fetched through cfscrape
            cloudflare = image_url.startswith("CF!")
            if cloudflare:
                image_url = image_url[3:]

            ext = self._image_extension(image_url)
            filename = "{}.{}".format(str(i).zfill(index_fill), ext)
            image_file = os.path.join(tempdir, filename)

            self.logger.info("Downloading image file {} to {}"
                             .format(image_url, image_file))

            if cloudflare:
                scraper = cfscrape.create_scraper()
                content = scraper.get(image_url).content
                with open(image_file, "wb") as f:
                    f.write(content)
            else:
                urlretrieve(image_url, image_file)

            downloaded.append(image_file)

        if _format in ["cbz", "zip"]:
            self.logger.debug("Zipping Files")
            exit_code = Popen(["zip", "-j", dest_path] + downloaded,
                              stdout=DEVNULL, stderr=DEVNULL).wait()
            if exit_code != 0:
                self.logger.warning(
                    "zip exited with code {}".format(exit_code)
                )
            shutil.rmtree(tempdir)
        elif _format == "dir":
            if os.path.isdir(dest_path):
                # Keep the plain rename semantics for existing destinations;
                # shutil.move would nest the directory inside of it
                os.rename(tempdir, dest_path)
            else:
                # shutil.move also works when the temporary directory and
                # the destination are on different file systems
                shutil.move(tempdir, dest_path)
        else:
            self.logger.warning("Invalid format {}".format(_format))
            # Do not leave the downloaded images behind
            shutil.rmtree(tempdir, ignore_errors=True)

        return dest_path

    def __str__(self) -> str:
        """
        :return: The string representation of the object
        """
        return self.name
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
# imports
from typing import List
from manga_dl.entities.MangaPage import MangaPage
class MangaChapter(object):
    """
    Class that models a Chapter of the manga
    Contains the individual chapter pages
    """

    # The individual pages of the chapter (replaced per instance in __init__)
    pages = []

    # The chapter's chapter number.
    # Is a float to allow chapters like 5.5 or the like
    chapter_number = -1.0

    # The chapter's name. An empty string means that get_chapter_name
    # generates the name from the chapter number
    chapter_name = ""

    def __init__(self, chapter_number: float, pages: "List[MangaPage]") \
            -> None:
        """
        Initializes a Manga chapter with the contained pages.
        :param chapter_number: The chapter number of this chapter
        :param pages: the chapter pages as list of MangaPage objects
        :return: None
        """
        self.chapter_number = chapter_number
        self.pages = pages

    def set_chapter_name(self, name: str) -> None:
        """
        Sets the chapter's name
        :param name: the name of the chapter
        :return: None
        """
        self.chapter_name = name

    def get_chapter_name(self) -> str:
        """
        Generates a Chapter name for this chapter based on the chapter number,
        or returns the chapter name previously set by the set_chapter_name
        method.
        Whole numbers become "Chapter 005", fractional ones "Chapter 005.5".
        :return: the chapter name as a string
        """
        if self.chapter_name != "":
            return self.chapter_name

        # Convert first: int objects only gained is_integer() in Python 3.12,
        # so an integer chapter number would otherwise raise AttributeError
        number = float(self.chapter_number)
        if number.is_integer():
            return "Chapter " + str(int(number)).zfill(3)

        pre_dot, post_dot = str(number).split(".")
        return "Chapter " + pre_dot.zfill(3) + "." + post_dot

    def get_pages(self) -> "List[MangaPage]":
        """
        :return: a list of manga pages belonging to this chapter
        """
        return self.pages
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
class MangaPage(object):
    """
    Model of a single page inside a Manga chapter
    """

    # Number of the page; -1 until __init__ stores the real value
    page_number = -1

    # URL of the image that shows this page
    image_url = ""

    # Explicitly assigned page name; while empty, get_page_name builds
    # the name from the page number
    page_name = ""

    def __init__(self, page_number: int, image_url: str) -> None:
        """
        Stores the page number and the image URL of the page
        :param page_number: the page number
        :param image_url: the image URL
        :return: None
        """
        self.image_url = image_url
        self.page_number = page_number

    def set_page_name(self, name: str) -> None:
        """
        Assigns an explicit name to the page
        :param name: the name of the page
        :return: None
        """
        self.page_name = name

    def get_page_name(self) -> str:
        """
        Returns the name assigned with set_page_name. If none was assigned,
        the name is "Page " followed by the page number, zero-padded to
        three digits.
        :return: the page name as a string
        """
        if self.page_name != "":
            return self.page_name
        padded_number = str(self.page_number).zfill(3)
        return "Page " + padded_number
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
import os
import shutil
import requests
from typing import Tuple
from multiprocessing import Pool
from manga_dl.scrapers.MangaScraperManager import MangaScraperManager
class MangaScraperNotFoundError(Exception):
    """
    Raised by MangaSeries when no scraper is available for the given URL
    """
    pass
class MangaSeries(object):
    """
    Class that models a Manga series. It is the entry point for all operations
    related to downloading, repairing and zipping manga series.
    It offers an automatic scraper detection system that tries to find a
    fitting scraper for the URL provided
    """

    # The manga series' URL
    url = ""

    # The root directory of the downloaded manga
    root_directory = ""

    # The scraper used to find the volumes belonging to the series
    scraper = None

    # List of volumes of the series (every instance gets its own list
    # in __init__)
    volumes = []

    # Flag that can be set to enable console output
    verbose = False

    # Flag that can be set to disable any changes to the system,
    # i.e. a dry run akin to the rsync dry run flag '-n'
    dry_run = False

    # Defines the maximum number of concurrent worker processes used
    # while scraping and downloading
    max_threads = 1

    def __init__(self, url: str, root_directory: str) -> None:
        """
        Initializes the Manga series
        :param url: the URL for where to look for volumes to scrape
        :param root_directory: the directory in which the local copy of the
                               series resides in
        :raises: MangaScraperNotFoundError, if no applicable manga scraper
                 was found
        """
        self.url = url
        self.root_directory = root_directory
        self.volumes = []

        # Automatically find the correct scraper
        self.scraper = MangaScraperManager.get_scraper_for(url)

        if self.scraper is None:
            raise MangaScraperNotFoundError()

    def scrape(self, skip_existing_chapters: bool = False) -> None:
        """
        Finds a list of all volumes using the scraper found in the __init__
        method. Does nothing if volumes were already scraped.
        :param skip_existing_chapters: Can be set to skip existing chapters
        :return: None
        """
        if self.scraper is not None and len(self.volumes) == 0:
            self.volumes = self.scraper.scrape_volumes_from_url(
                self.url,
                self.root_directory,
                skip_existing_chapters=skip_existing_chapters,
                max_threads=self.max_threads,
                verbose=self.verbose
            )

    def download_manga(self, update: bool = False, repair: bool = False):
        """
        Starts downloading the manga series
        :param update: flag to set an update process,
                       i.e. only downloads files that don't exist
        :param repair: flag to set a repair process,
                       i.e. updates + checks if files are OK
        :return: None
        """
        if self.verbose:
            print("Scraping " + self.url)

        self.scrape(skip_existing_chapters=update)

        create_directories = not self.dry_run
        if create_directories:
            os.makedirs(self.root_directory, exist_ok=True)

        # One tuple per page, in the layout expected by download_file
        download_parameters = []
        for volume in self.volumes:
            volume_directory = os.path.join(
                self.root_directory,
                volume.get_volume_name()
            )
            if create_directories:
                os.makedirs(volume_directory, exist_ok=True)

            for chapter in volume.get_chapters():
                chapter_directory = os.path.join(
                    volume_directory,
                    chapter.get_chapter_name()
                )
                if create_directories:
                    os.makedirs(chapter_directory, exist_ok=True)

                for page in chapter.get_pages():
                    page_file = os.path.join(
                        chapter_directory,
                        page.get_page_name() + ".jpg"
                    )
                    download_parameters.append((page.image_url,
                                                page_file,
                                                not update,
                                                repair,
                                                self.verbose,
                                                self.dry_run))

        threadpool = Pool(self.max_threads)
        try:
            threadpool.map(MangaSeries.download_file, download_parameters)
        finally:
            # Release the worker processes even if a download raised
            threadpool.close()
            threadpool.join()

    def update(self) -> None:
        """
        Updates the current directory with volumes and chapters
        that do not exist yet.
        :return: None
        """
        self.download_manga(update=True)

    def repair(self) -> None:
        """
        Downloads the series and checks every existing file for consistency,
        replacing it if needed.
        :return: None
        """
        self.download_manga(repair=True)

    def zip(self, zip_volumes: bool = False, zip_chapters: bool = False):
        """
        Zips parts of the series together to enable reading in some manga
        readers, like ComicRack for android.
        Only directories named "Volume ..." and "Chapter ..." are zipped.
        :param zip_volumes: flag to enable zipping volumes
        :param zip_chapters: flag to enable zipping chapters
        :return: None
        """
        if not (zip_volumes or zip_chapters):
            return
        if not os.path.isdir(self.root_directory):
            # Nothing was downloaded (for example during a dry run)
            return

        for volume in os.listdir(self.root_directory):
            volume_dir = os.path.join(self.root_directory, volume)
            if not volume.startswith("Volume ") \
                    or not os.path.isdir(volume_dir):
                continue

            if zip_volumes and not self.dry_run:
                shutil.make_archive(volume_dir, "zip", volume_dir)

            if not zip_chapters:
                continue

            for chapter in os.listdir(volume_dir):
                chapter_dir = os.path.join(volume_dir, chapter)
                if chapter.startswith("Chapter ") \
                        and os.path.isdir(chapter_dir) \
                        and not self.dry_run:
                    shutil.make_archive(chapter_dir, "zip", chapter_dir)

    def zip_chapters(self) -> None:
        """
        Zips the series by chapter
        :return: None
        """
        self.zip(zip_chapters=True)

    def zip_volumes(self) -> None:
        """
        Zips the series by volume
        :return: None
        """
        self.zip(zip_volumes=True)

    def zip_all(self) -> None:
        """
        Zips the series by Volume and then by chapter
        :return: None
        """
        self.zip(zip_volumes=True, zip_chapters=True)

    def set_verbose(self, verbose: bool = True) -> None:
        """
        Sets the verbosity flag
        :param verbose: the new value of the verbosity flag, defaults to True
        :return: None
        """
        self.verbose = verbose

    def set_dry_run(self, dry_run: bool = True) -> None:
        """
        Sets the dry_run flag
        :param dry_run: the new value of the dry_run flag, defaults to True
        :return: None
        """
        self.dry_run = dry_run

    def set_maximum_thread_amount(self, max_threads: int) -> None:
        """
        Sets the maximum amount of workers to be used
        :param max_threads: the new maximum
        :return: None
        """
        self.max_threads = max_threads

    @staticmethod
    def download_file(options: Tuple[str, str, bool, bool, bool, bool]) \
            -> None:
        """
        Downloads a file, can also be used to repair previously downloaded
        files. Can be run in parallel using the multiprocessing Pool class.
        This limits the function to a single parameter, a Tuple in this case
        :param options: Tuple containing the following parameters:
                            url: the file's URL
                            destination: the local destination for the file
                            overwrite_existing: flag that enables overwriting
                                                existing files
                            repair: flag that can be set to enable repair mode
                            verbose: Sets the verbosity flag
                            dry_run: Sets the dry run flag
        :return: None
        """
        url, destination, overwrite_existing, repair, verbose, dry_run = \
            options

        # Progress output is only wanted in verbose mode
        if verbose:
            print("Downloading " + url)

        if not overwrite_existing and os.path.isfile(destination):
            return

        if repair:
            try:
                file_size = os.path.getsize(destination)
            except FileNotFoundError:
                if verbose:
                    print("File does not exist")
            else:
                # A server that sends no Content-Length gives us nothing
                # to compare against, the file is downloaded again then
                remote_size = \
                    requests.head(url).headers.get("Content-Length")
                if remote_size is not None and int(remote_size) == file_size:
                    return
                if verbose:
                    print("Local file broken, starting repair")

        if dry_run:
            return

        if verbose:
            print("Downloading " + destination)

        # Fetch before opening the file, so that a failed request does not
        # leave an empty or truncated file behind
        response = requests.get(url)
        if not response.ok:
            # Do not store an error page as if it were the image
            print("Download failed (HTTP {}): {}"
                  .format(response.status_code, url))
            return

        with open(destination, 'wb') as destination_file:
            destination_file.write(response.content)
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
# import
from typing import List
from manga_dl.entities.MangaChapter import MangaChapter
class MangaVolume(object):
    """
    Model of a Manga volume, which holds a list of chapters
    """

    # Chapters that make up the volume (replaced per instance in __init__)
    chapters = []

    # Number of the volume; -1 until __init__ stores the real value
    volume_number = -1

    # Explicitly assigned volume name; while empty, get_volume_name builds
    # the name from the volume number
    volume_name = ""

    def __init__(self, volume_number: int, chapters: "List[MangaChapter]") \
            -> None:
        """
        Stores the volume number and the chapters of the volume
        :param volume_number: The volume number of this volume
        :param chapters: the chapters belonging to this volume
        :return: None
        """
        self.chapters = chapters
        self.volume_number = volume_number

    def set_volume_name(self, name: str) -> None:
        """
        Assigns an explicit name to the volume
        :param name: the name of the volume
        :return: None
        """
        self.volume_name = name

    def get_volume_name(self) -> str:
        """
        Returns the name assigned with set_volume_name. If none was assigned,
        the name is "Volume " followed by the volume number, zero-padded to
        two digits.
        :return: the volume name as a string
        """
        if self.volume_name != "":
            return self.volume_name
        padded_number = str(self.volume_number).zfill(2)
        return "Volume " + padded_number

    def get_chapters(self) -> "List[MangaChapter]":
        """
        :return: a list of manga chapters belonging to this volume
        """
        return self.chapters
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
# imports
import os
import argparse
import multiprocessing
from manga_dl.scrapers.MangaScraperManager import MangaScraperManager
from manga_dl.entities.MangaSeries import MangaSeries, \
MangaScraperNotFoundError
def main() -> None:
    """
    Parses CLI arguments and starts the program.
    Downloads the series found at the given URL and zips it afterwards
    if requested.
    :return: None
    """
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument("url")
        parser.add_argument("-d", "--destination")
        parser.add_argument("-t", "--threads", type=int,
                            default=multiprocessing.cpu_count())
        parser.add_argument("-v", "--verbose", action="store_true",
                            default=False)
        # The documentation spells these options with hyphens; the
        # underscore spellings stay available for existing callers
        parser.add_argument("--zip-chapters", "--zip_chapters",
                            dest="zip_chapters", action="store_true",
                            default=False)
        parser.add_argument("--zip-volumes", "--zip_volumes",
                            dest="zip_volumes", action="store_true",
                            default=False)
        parser.add_argument("--repair", action="store_true", default=False)
        parser.add_argument("--update", action="store_true", default=False)
        args = parser.parse_args()

        if args.url:

            # Without an explicit destination the series is stored in a
            # directory named after the series in the working directory
            if args.destination:
                destination = args.destination
            else:
                series_name = \
                    MangaScraperManager.get_series_name_from_url(args.url)
                destination = os.path.join(os.getcwd(), series_name)

            series = MangaSeries(args.url, destination)
            series.set_verbose(args.verbose)
            series.set_maximum_thread_amount(args.threads)
            series.download_manga(update=args.update, repair=args.repair)
            series.zip(
                zip_chapters=args.zip_chapters,
                zip_volumes=args.zip_volumes
            )

        else:
            print(
                "No valid argument combination supplied. "
                "See --help for more details"
            )

    except MangaScraperNotFoundError:
        print("The provided URL is not supported")
    except KeyboardInterrupt:
        print("Thanks for using manga_dl!")
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
# imports
from typing import List
from manga_dl.entities.MangaVolume import MangaVolume
class GenericMangaScraper(object):
    """
    Interface that describes the operations of a Manga Scraper.
    Every method raises NotImplementedError and has to be provided
    by the concrete scraper.
    """

    @staticmethod
    def url_match(manga_url: str) -> bool:
        """
        Tells if the scraper is responsible for a URL
        :param manga_url: the URL to check
        :return: True if it matches, False otherwise
        """
        raise NotImplementedError()

    @staticmethod
    def get_series_name(manga_url: str) -> str:
        """
        Derives the name of the series from its URL
        :param manga_url: The URL to parse
        :return: The series name
        """
        raise NotImplementedError()

    @staticmethod
    def scrape_volumes_from_url(
            manga_url: str,
            manga_directory: str,
            skip_existing_chapters: bool = False,
            max_threads: int = 1,
            verbose: bool = False
    ) -> "List[MangaVolume]":
        """
        Scrapes the volumes of the series found at a URL
        :param manga_url: the given URL to scrape
        :param manga_directory: the manga directory,
                                which can be used to skip existing chapters
        :param skip_existing_chapters: Flag that can be set to skip existing
                                       chapters,
                                       thereby increasing scraping speed
        :param max_threads: the maximum numbers of threads to use
        :param verbose: Sets the verbosity flag. Defaults to no output
        :return: a list of volumes, which should also contain chapters
        """
        raise NotImplementedError()
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
import os
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple
from multiprocessing import Pool
from manga_dl.scrapers.GenericMangaScraper import GenericMangaScraper
from manga_dl.entities.MangaChapter import MangaChapter
from manga_dl.entities.MangaPage import MangaPage
from manga_dl.entities.MangaVolume import MangaVolume
class MangaFoxScraper(GenericMangaScraper):
"""
Class that models how a Manga Scraper should operate
"""
@staticmethod
def parse_page(options: Tuple[int, bool, str]) -> "MangaPage":
    """
    Parses a single page of a chapter. Can be run in parallel using
    multiprocessing.Pool, which is the reason why the arguments are all
    passed via a single Tuple, due to the limitations of Pool.map()
    :param options: the options for the page to parse:
                        image_number: The image number of the image page
                                      to parse
                        verbose: Enabling or disabling verbose output
                                 (currently not used by this method)
                        chapter_base_url: The base URL of the chapter
    :raises: RuntimeError, if the page could not be loaded after
             several attempts
    :return: the scraped manga page object
    """
    # Local import: only needed for the pause between attempts
    import time

    image_number, verbose, chapter_base_url = options
    image_page_url = chapter_base_url + "/" + str(image_number) + ".html"

    # When using multiple threads, sometimes 503 errors occur, which is
    # why the request is repeated. The number of attempts is limited so
    # that a permanently failing page can not block the program forever.
    max_attempts = 10
    result = requests.get(image_page_url)
    attempt = 1
    while result.status_code != 200:
        if attempt >= max_attempts:
            raise RuntimeError(
                "Could not load {} after {} attempts (last HTTP status {})"
                .format(image_page_url, max_attempts, result.status_code)
            )
        # Wait a little longer after every failed attempt
        time.sleep(0.5 * attempt)
        result = requests.get(image_page_url)
        attempt += 1

    image_html = result.text
    image_soup = BeautifulSoup(image_html, "html.parser")

    # NOTE(review): a document of exactly 204 characters is treated as an
    # error; presumably that is the size of an error page - confirm
    if len(str(image_soup)) == 204:
        print(image_soup)
        raise Exception("Unexpected content at " + image_page_url)

    # The first <img> element of the page is taken as the page image
    image = image_soup.select("img")[0]
    image_url = str(image).split("src=\"")[1].split("\"")[0]
    # Turn HTML-escaped "&amp;" back into a plain "&"
    image_url = image_url.replace("amp;", "")

    return MangaPage(image_number, image_url)
@staticmethod
def get_series_name(manga_url: str) -> str:
    """
    Returns the very end of the URL, as this is the name of the series.
    Trailing slashes are ignored, underscores become spaces and every
    word is capitalized.
    :param manga_url: The URL to parse
    :return: The series name
    """
    # [-1] also works for input that does not contain any slash
    last_segment = manga_url.rstrip("/").rsplit("/", 1)[-1]
    return last_segment.replace("_", " ").title()
@staticmethod
def url_match(manga_url: str) -> bool:
    """
    Tells if a URL belongs to this scraper, which is the case when it
    begins with the mangafox.me address
    :param manga_url: the URL to check
    :return: True if it matches, False otherwise
    """
    expected_prefix = "http://mangafox.me"
    return manga_url[:len(expected_prefix)] == expected_prefix
@staticmethod
def scrape_volumes_from_url(manga_url: str, manga_directory: str,
skip_existing_chapters: bool = False,
max_threads: int = 1, verbose: bool = False) \
-> List[MangaVolume]:
"""
Scrapes a given URL from mangafox.me
:param manga_url: the given URL to scrape
:param manga_directory: the manga directory,
which can be used to skip existing chapters
:param skip_existing_chapters: Flag that can be set to skip existing
chapters,
thereby increasing scraping speed
:param max_threads: The maximum amount of threads that can be used
:param verbose: Sets the verbosity flag. Defaults to no output
:return: a list of volumes, which should also contain chapters
"""
html = requests.get(manga_url).text
soup = BeautifulSoup(html, "html.parser")
volumes = soup.select(".chlist")
# Find the highest volume number
# Sometimes a 'Volume 00' exists,
# which then results in us having to decrement the highest number by 1
volume_number = len(volumes)
if "\"Volume 00\"" in html:
volume_number -= 1
volume_objects = []
for volume in volumes:
if verbose:
print("Scraping Volume " + str(volume_number))
chapters = volume.select(".tips")
chapter_objects = []
for chapter in chapters:
chapter_start_url = \
str(chapter).split("href=\"")[1].split("\"")[0]
chapter_base_url = chapter_start_url.rsplit("/", 1)[0]
chapter_number = float(chapter.text.rsplit(" ", 1)[1])
if chapter_number.is_integer():
formatted_chapter_number = \
str(int(chapter_number)).zfill(3)
else:
pre_dot, post_dot = str(chapter_number).split(".")
formatted_chapter_number =\
pre_dot.zfill(3) + "." + post_dot
chapter_directory = os.path.join(
manga_directory,
"Volume " + str(volume_number).zfill(2)
)
chapter_directory = os.path.join(
chapter_directory,
"Chapter " + formatted_chapter_number
)
if verbose:
print("Scraping Chapter " + str(chapter_number))
chapter_html = requests.get(chapter_start_url).text
chapter_soup = BeautifulSoup(chapter_html, "html.parser")
page_amount = int(str(
chapter_soup.select(".l")[0])
.rsplit("of ", 1)[1]
.split("\t", 1)[0]
) # Don't ask
if os.path.isdir(chapter_directory) and skip_existing_chapters:
if page_amount == len(os.listdir(chapter_directory)):
if verbose:
print("Skipping Chapter " +
formatted_chapter_number)
continue
poolsize = page_amount if page_amount < max_threads \
else max_threads
threadpool = Pool(processes=poolsize)
page_arguments = []
for number in range(1, page_amount + 1):
page_arguments.append((number, verbose, chapter_base_url))
page_objects = threadpool.map(
MangaFoxScraper.parse_page,
page_arguments
)
# Waits for threads to finish
threadpool.close()
threadpool.join()
chapter_objects.append(MangaChapter(
chapter_number,
page_objects
))
volume_objects.append(MangaVolume(volume_number, chapter_objects))
volume_number -= 1
return volume_objects
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
from manga_dl.scrapers.GenericMangaScraper import GenericMangaScraper
from manga_dl.scrapers.MangaFoxScraper import MangaFoxScraper
class MangaScraperManager(object):
    """
    Central lookup that selects the matching scraper for a manga URL
    """

    scrapers = [MangaFoxScraper]
    """
    The scraper classes that are currently implemented
    """

    @staticmethod
    def get_scraper_for(manga_url: str) -> type(GenericMangaScraper):
        """
        Finds the first implemented scraper whose url_match() accepts the URL
        :param manga_url: the URL of the Manga series
        :return: The matching scraper class, or None if none was found
        """
        # NOTE(review): the annotation evaluates to the metaclass of
        # GenericMangaScraper and does not mention the None result - confirm
        # whether Optional[Type[GenericMangaScraper]] was intended
        candidates = filter(
            lambda candidate: candidate.url_match(manga_url),
            MangaScraperManager.scrapers
        )
        return next(candidates, None)

    @staticmethod
    def get_series_name_from_url(manga_url: str) -> str:
        """
        Tries to figure out the name of a manga series from its URL
        :param manga_url: The URL to check
        :return: The series name,
                 or an empty string if no applicable scraper exists
        """
        scraper = MangaScraperManager.get_scraper_for(manga_url)
        if scraper is None:
            return ""
        return scraper.get_series_name(manga_url)
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
import logging
from typing import Optional, List, Set
from manga_dl.entities.Chapter import Chapter
class Scraper:
    """
    Specifies the Capabilities of a manga download site scraper
    Subclasses have to implement name(), url_matches(), generate_url()
    and _load_chapters()
    """

    def __init__(
            self,
            _format: str = "cbz",
            destination: Optional[str] = None,
            languages: Optional[Set[str]] = None
    ):
        """
        Initializes the Scraper object
        :param _format: The format in which to store chapters
        :param destination: The destination directory in
                            which to store chapters
        :param languages: Set of languages for which to check.
                          Defaults to {"english", "gb", "us"}
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.format = _format
        self.destination = destination
        if languages is None:
            self.languages = {"english", "gb", "us"}
        else:
            self.languages = languages

    @classmethod
    def name(cls) -> str:
        """
        :return: The name of the scraper
        """
        raise NotImplementedError()

    @classmethod
    def url_matches(cls, url: str) -> bool:
        """
        Checks whether or not an URL matches for the scraper
        :param url: The URL to check
        :return: Whether the URL is valid
        """
        raise NotImplementedError()

    def generate_url(self, _id: str) -> str:
        """
        Generates an URL based on an ID
        :param _id: The ID to use
        :return: The generated URL
        """
        raise NotImplementedError()

    def load_chapters(
            self,
            url: Optional[str] = None,
            _id: Optional[str] = None
    ) -> List[Chapter]:
        """
        Loads a list of Chapter objects for an URL or ID
        Only one of either an URL or an ID is required
        If both a valid URL and an ID are provided, the URL generated
        from the ID is used
        :param url: The URL
        :param _id: The ID
        :return: The list of chapters, sorted by chapter number and limited
                 to the languages of this scraper. Empty if neither URL nor
                 ID were provided or if the URL does not match
        """
        if url is None and _id is None:
            self.logger.warning("Neither URL or ID provided. Can't continue.")
            return []
        elif url is not None and not self.url_matches(url):
            self.logger.warning("Invalid URL. Can't continue.")
            return []
        elif _id is not None:
            url = self.generate_url(_id)

        chapters = self._load_chapters(str(url))

        def sort_key(chapter: Chapter) -> Tuple[str, str]:
            """
            :param chapter: The chapter for which to create the sort key
            :return: The zero-padded part before the first dot,
                     followed by the zero-padded complete chapter number
            """
            # Coerce to str before splitting so that non-string chapter
            # numbers are handled the same way in both parts of the key
            number = str(chapter.chapter_number)
            return number.split(".")[0].zfill(15), number.zfill(15)

        # Both parts of the key are necessary! The part before the dot is
        # the primary criterion, the complete number breaks ties
        chapters.sort(key=sort_key)

        return [
            chapter for chapter in chapters
            if chapter.language in self.languages
        ]

    def _load_chapters(self, url: str) -> List[Chapter]:
        """
        Scraper-specific implementation that loads chapters from the website
        :param url: The URL to scrape
        :return: The list of chapters found while scraping
        """
        raise NotImplementedError()
......@@ -16,3 +16,10 @@ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
from manga_dl.scrapers.mangadex import MangaDexScraper
# Module-level list of the available scraper classes;
# MangaDexScraper is currently the only entry.
scrapers = [MangaDexScraper]
"""
A list of manga scrapers
"""
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
import re
import json
import cfscrape
from typing import List
from manga_dl.entities.Chapter import Chapter
from manga_dl.scrapers.Scraper import Scraper
class MangaDexScraper(Scraper):
    """
    Scraper for mangadex.org
    """

    @classmethod
    def name(cls) -> str:
        """
        :return: The name of the scraper
        """
        return "mangadex"

    @classmethod
    def url_matches(cls, url: str) -> bool:
        """
        Checks whether or not an URL matches for the scraper
        :param url: The URL to check
        :return: Whether the URL is valid
        """
        # The dots are escaped so that only the literal host name matches,
        # an unescaped '.' would accept any character in its place
        return bool(re.match(r"^https://mangadex\.org/title/[0-9]+", url))

    def generate_url(self, _id: str) -> str:
        """
        Generates an URL based on an ID
        :param _id: The ID to use
        :return: The generated URL
        """
        return "https://mangadex.org/title/" + _id

    def _load_chapters(self, url: str) -> List[Chapter]:
        """
        Loads the chapters from mangadex.org
        :param url: The URL to scrape
        :return: The chapters found for the series,
                 or an empty list if the request was unsuccessful
        """
        scraper = cfscrape.create_scraper()

        # The ID is the path segment that directly follows '/title/'
        mangadex_id = url.split("https://mangadex.org/title/")[1].split("/")[0]
        manga_url = "https://mangadex.org/api/manga/" + str(mangadex_id)

        resp = scraper.get(manga_url)
        if resp.status_code >= 300:
            self.logger.warning("Unsuccessful request ({})"
                                .format(resp.status_code))
            self.logger.debug(resp.text)
            return []

        series_info = json.loads(resp.text)
        series_title = series_info["manga"]["title"]
        # Falls back to an empty mapping if the chapter entry is
        # missing or null, so that iterating below can not fail
        chapter_list = series_info.get("chapter") or {}

        # Without an explicit destination, the series title is used
        if self.destination is None:
            destination = series_title
        else:
            destination = self.destination

        chapters = []
        for chapter_id, chapter in chapter_list.items():
            chapter_url = "https://mangadex.org/api/chapter/" + str(chapter_id)
            chapters.append(Chapter(
                chapter_url,
                chapter["lang_code"],
                series_title,
                chapter["chapter"],
                destination,
                self.format,
                self.get_image_pages,
                chapter["title"],
            ))

        return chapters

    @staticmethod
    def get_image_pages(_self: Chapter, url: str) -> List[str]:
        """
        Callback method for the Chapter object.
        Loads the correct image URL for a page
        :param _self: The chapter that calls this method
        :param url: The base chapter URL
        :return: The page image URLs,
                 or an empty list if the request was unsuccessful
        """
        scraper = cfscrape.create_scraper()
        resp = scraper.get(url)

        if resp.status_code >= 300:
            _self.logger.warning("Unsuccessful request ({})"
                                 .format(resp.status_code))
            _self.logger.debug(resp.text)
            return []

        chapter_info = json.loads(resp.text)

        server = chapter_info["server"]
        if server == "/data/":
            server = "CF!https://mangadex.org/data/"  # Cloudflare protected

        # Every image URL consists of the server, the chapter hash
        # and the file name of the page
        base_url = server + chapter_info["hash"] + "/"
        return [base_url + page for page in chapter_info["page_array"]]
......@@ -40,7 +40,13 @@ if __name__ == "__main__":
license="GNU GPL3",
packages=find_packages(),
scripts=list(map(lambda x: os.path.join("bin", x), os.listdir("bin"))),
install_requires=["bs4", "requests"],
install_requires=[
"bs4",
"requests",
"sentry-sdk",
"puffotter",
"cfscrape"
],
include_package_data=True,
zip_safe=False
)
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>
This file is part of manga-dl.
manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""