diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 187cdef6b..000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -frontera/_version.py export-subst diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 000000000..00ec9dc8a --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,25 @@ +name: Publish +on: + push: + tags: + - '[0-9]+.[0-9]+.[0-9]+' +jobs: + publish: + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/${{ github.event.repository.name }} + permissions: + id-token: write + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.13 + - run: | + python -m pip install --upgrade build + python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..37c47b60e --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,91 @@ +# TODO: Run Docker images to run tests that are otherwise skipped. Run ‘tox -e +# all -- tests -rs’ to get a list of those services. +name: test +on: + push: + branches: [ main ] + pull_request: +jobs: + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - python-version: '3.9' + toxenv: min + - python-version: '3.9' + toxenv: min-hbase + - python-version: '3.9' + toxenv: min-kafka + - python-version: '3.9' + toxenv: min-logging + - python-version: '3.9' + toxenv: min-s3 + - python-version: '3.9' + toxenv: min-scrapy + - python-version: '3.9' + toxenv: min-sql + - python-version: '3.9' + toxenv: min-zeromq + - python-version: '3.9' + toxenv: min-all + - python-version: '3.9' + - python-version: '3.10' + - python-version: '3.11' + - python-version: '3.11' + toxenv: kafka + - python-version: '3.12' + - python-version: '3.13' + - python-version: '3.13' + toxenv: hbase + - python-version: '3.13' + toxenv: logging + - python-version: '3.13' + toxenv: s3 + - python-version: '3.13' + toxenv: scrapy + - python-version: '3.13' + toxenv: sql + - python-version: '3.13' + toxenv: zeromq + - python-version: '3.13' + toxenv: all + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + - name: tox + run: | + tox -e ${{ matrix.toxenv || 'py' }} + - name: coverage + if: ${{ success() }} + uses: codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} + check: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.13"] + tox-job: ["pre-commit"] + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox + - name: tox + run: | + tox -e ${{ matrix.tox-job }} \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..feb22619c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.4 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format + diff --git a/.travis.yml 
b/.travis.yml deleted file mode 100644 index a2f138639..000000000 --- a/.travis.yml +++ /dev/null @@ -1,69 +0,0 @@ -language: python -python: 2.7 -branches: - only: - - master - - /^\d\.\d+$/ - - /^\d\.\d+\.\d+(rc\d+|dev\d+)?$/ - -services: - - docker - - mysql - - postgresql - -env: - global: - - DOCKER_COMPOSE_VERSION=1.7.1 - matrix: - - TOXENV=py27 - - TOXENV=flake8 - -matrix: - include: - - python: 3.5 - env: TOXENV=py35 - services: - - docker - - mysql - - postgresql - -install: - - pip install -U tox wheel codecov - - pip install -r requirements/tests.txt - -before_install: - - sudo apt-get update - - sudo rm -f /usr/local/bin/docker-compose - - curl -L https://github.com/docker/compose/releases/download/${DOCKER_COMPOSE_VERSION}/docker-compose-`uname -s`-`uname -m` > docker-compose - - chmod +x docker-compose - - sudo mv docker-compose /usr/local/bin - - git clone https://github.com/scrapinghub/hbase-docker.git - - docker pull scrapinghub/hbase-docker - - mkdir data - - ./hbase-docker/start-hbase.sh - -before_script: - - mysql -u root -e "set global innodb_large_prefix=1;" - - mysql -u root -e "set global innodb_file_format='Barracuda';" - - mysql -u root -e "set global innodb_file_per_table=true;" - - tests/run_zmq_broker.sh - - docker --version - - docker-compose --version - - docker-compose --verbose -f tests/kafka/docker-compose.yml up -d - - docker ps -a - -script: tox - -after_success: - - codecov - -deploy: - provider: pypi - distributions: sdist bdist_wheel - user: scrapinghub - password: - secure: bG1ycgBJrVfgBsHAwaWScL2V1x52/aZy2znE2kFzFfYq95HS51mu1GO2PKwpATJyMCs/xUIKGhiBtoo8LzNmNU2kPgarQFwzpLkx2ninOm+3uAwuIfGGWsonBW/h854n+cNwppJOlfEOxvMVerVVpdV4EFeJfEkVYascwOBbnqo= - on: - branch: master - tags: true - condition: $TOXENV == py27 diff --git a/CHANGES.rst b/CHANGES.rst new file mode 100644 index 000000000..42a4c6145 --- /dev/null +++ b/CHANGES.rst @@ -0,0 +1,35 @@ +================== +frontera changelog +================== + +0.7.2 (unreleased) +================== + +- Dropped support for Python 3.8 and lower, added support for Python 3.9 and + higher. + +- Dependency updates: + + - | core: + | ``six`` is no longer a dependency + | ``w3lib``: ``>=1.15.0`` → ``>=1.17.0`` + + - | ``kafka`` extra: + | ``kafka-python``: ``>=1.0.0`` → ``>=1.4.3`` + | ``twisted`` (``>=20.3.0``) is now a dependency + + - | ``sql`` extra: + | ``cachetools``: ``>=0.4.0`` + | ``SQLAlchemy``: ``>=1.0.0`` → ``>=1.0.0,<1.4`` + + - | ``zeromq`` extra: + | ``pyzmq``: ``>=19.0.2`` + +- New extras: ``s3``, ``scrapy``. + + +Earlier releases +================ + +Find the earlier commit history `at GitHub +`_. 
diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index d025d4863..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include versioneer.py -include frontera/_version.py diff --git a/docs/source/_ext/fronteradocs.py b/docs/source/_ext/fronteradocs.py index 06e68ddbe..d3d2561fc 100644 --- a/docs/source/_ext/fronteradocs.py +++ b/docs/source/_ext/fronteradocs.py @@ -1,8 +1,7 @@ -from docutils.parsers.rst.roles import set_classes from docutils import nodes +from docutils.parsers.rst.roles import set_classes - -REPO = 'https://github.com/scrapinghub/frontera/' +REPO = "https://github.com/scrapinghub/frontera/" def setup(app): @@ -11,34 +10,51 @@ def setup(app): rolename="setting", indextemplate="pair: %s; setting", ) - app.add_role('source', source_role) - app.add_role('commit', commit_role) - app.add_role('issue', issue_role) - app.add_role('rev', rev_role) - - -def source_role(name, rawtext, text, lineno, inliner, options={}, content=[]): - ref = REPO + 'blob/master/' + text + app.add_role("source", source_role) + app.add_role("commit", commit_role) + app.add_role("issue", issue_role) + app.add_role("rev", rev_role) + + +def source_role(name, rawtext, text, lineno, inliner, options=None, content=None): + if content is None: + content = [] + if options is None: + options = {} + ref = REPO + "blob/master/" + text set_classes(options) node = nodes.reference(rawtext, text, refuri=ref, **options) return [node], [] -def issue_role(name, rawtext, text, lineno, inliner, options={}, content=[]): - ref = REPO + 'issues/' + text +def issue_role(name, rawtext, text, lineno, inliner, options=None, content=None): + if content is None: + content = [] + if options is None: + options = {} + ref = REPO + "issues/" + text set_classes(options) - node = nodes.reference(rawtext, 'issue ' + text, refuri=ref, **options) + node = nodes.reference(rawtext, "issue " + text, refuri=ref, **options) return [node], [] -def commit_role(name, rawtext, text, lineno, inliner, options={}, content=[]): - ref = REPO + 'commit/' + text +def commit_role(name, rawtext, text, lineno, inliner, options=None, content=None): + if content is None: + content = [] + if options is None: + options = {} + ref = REPO + "commit/" + text set_classes(options) node = nodes.reference(rawtext, text, refuri=ref, **options) return [node], [] -def rev_role(name, rawtext, text, lineno, inliner, options={}, content=[]): - ref = REPO + 'changeset/' + text + +def rev_role(name, rawtext, text, lineno, inliner, options=None, content=None): + if content is None: + content = [] + if options is None: + options = {} + ref = REPO + "changeset/" + text set_classes(options) - node = nodes.reference(rawtext, 'r' + text, refuri=ref, **options) - return [node], [] \ No newline at end of file + node = nodes.reference(rawtext, "r" + text, refuri=ref, **options) + return [node], [] diff --git a/docs/source/conf.py b/docs/source/conf.py index e720fc4b1..b0799f5e5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # frontera documentation build configuration file, created by # sphinx-quickstart on Tue Nov 18 17:54:50 2014. @@ -12,62 +11,63 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import os import sys -from os import path +from pathlib import Path # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -dir = path.dirname(__file__) -sys.path.extend([path.join(dir, "_ext"), path.join(dir, "../../")]) +DOCS_SRC_DIR = Path(__file__).parent +sys.path.extend([str(DOCS_SRC_DIR / "_ext"), str(DOCS_SRC_DIR / "../../")]) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'fronteradocs', + "sphinx.ext.autodoc", + "fronteradocs", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['ytemplates'] +templates_path = ["ytemplates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'Frontera' -copyright = u'2014-2016, Frontera authors' +project = "Frontera" +copyright = "Frontera authors" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.7' +version = "0.7" # The full version, including alpha/beta/rc tags. -release = '0.7.1' +release = "0.7.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -75,167 +75,161 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +html_theme = "default" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
-#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['ystatic'] +html_static_path = ["ystatic"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'fronteradoc' +htmlhelp_basename = "fronteradoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). 
+ #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'frontera.tex', u'Frontera Documentation', - u'ScrapingHub', 'manual'), + ("index", "frontera.tex", "Frontera Documentation", "ScrapingHub", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'frontera', u'Frontera Documentation', - [u'ScrapingHub'], 1) -] +man_pages = [("index", "frontera", "Frontera Documentation", ["ScrapingHub"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -244,35 +238,41 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'frontera', u'Frontera Documentation', - u'ScrapingHub', 'frontera', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "frontera", + "Frontera Documentation", + "ScrapingHub", + "frontera", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. 
-#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # -- Options for sphinx_rtd_theme ----------------------------------------- -#https://github.com/snide/sphinx_rtd_theme +# https://github.com/snide/sphinx_rtd_theme -import os -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +on_rtd = os.environ.get("READTHEDOCS", None) == "True" if on_rtd: - html_theme = 'default' + html_theme = "default" else: import sphinx_rtd_theme + html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # -- Options for autoclass ------------------------------------------------ # Use class and init docstrings for autoclass directive -autoclass_content = 'both' +autoclass_content = "both" diff --git a/docs/source/topics/frontera-settings.rst b/docs/source/topics/frontera-settings.rst index 805fc52f7..3ea99ab35 100644 --- a/docs/source/topics/frontera-settings.rst +++ b/docs/source/topics/frontera-settings.rst @@ -50,7 +50,7 @@ How to access settings def from_manager(cls, manager): manager = crawler.settings if settings.TEST_MODE: - print "test mode is enabled!" + print("test mode is enabled!") In other words, settings can be accessed as attributes of the :class:`Settings ` object. diff --git a/docs/source/topics/frontier-tester.rst b/docs/source/topics/frontier-tester.rst index eef48bfe9..39f4cb5eb 100644 --- a/docs/source/topics/frontier-tester.rst +++ b/docs/source/topics/frontier-tester.rst @@ -76,11 +76,11 @@ A working example using test data from graphs and :ref:`basic backends = self.max_pages_per_hostname: - self.logger.debug("Reached per host limit for URL %s, " - "already scheduled %d of %d allowed.", link.url, counts[hostname], - self.max_pages_per_hostname) + self.logger.debug( + "Reached per host limit for URL %s, " + "already scheduled %d of %d allowed.", + link.url, + counts[hostname], + self.max_pages_per_hostname, + ) continue - path_parts = url_parts.path.split('/') + path_parts = url_parts.path.split("/") score = 0.5 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1) self.schedule(link, score) counts[hostname] += 1 if counts[hostname] == self.max_pages_per_hostname: - self.logger.info("Reached per host limit for domain %s (%d)", hostname, self.max_pages_per_hostname) + self.logger.info( + "Reached per host limit for domain %s (%d)", + hostname, + self.max_pages_per_hostname, + ) - for hostname, count in six.iteritems(counts): + for hostname, count in counts.items(): domain = self.domain_cache.setdefault(hostname, {}) - domain['sc'] = domain.get('sc', 0)+count + domain["sc"] = domain.get("sc", 0) + count def _get_domain_bucket(self, url): parsed = urlparse.urlsplit(url) - hostname, _, _ = parsed.netloc.partition(':') + hostname, _, _ = parsed.netloc.partition(":") return self.domain_cache.setdefault(hostname, {}) def close(self): self.domain_cache.flush() - super(BCPerHostLimit, self).close() + super().close() diff --git a/examples/cluster/bc/config/__init__.py b/examples/cluster/bc/config/__init__.py index 7c68785e9..e69de29bb 100644 --- a/examples/cluster/bc/config/__init__.py +++ b/examples/cluster/bc/config/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/examples/cluster/bc/config/common.py b/examples/cluster/bc/config/common.py index b7d90613f..3a6aff588 100644 --- a/examples/cluster/bc/config/common.py +++ b/examples/cluster/bc/config/common.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from frontera.settings.default_settings import MIDDLEWARES MAX_NEXT_REQUESTS = 512 @@ -6,18 +5,20 @@ 
SPIDER_LOG_PARTITIONS = 1 DELAY_ON_EMPTY = 5.0 -MIDDLEWARES.extend([ - 'frontera.contrib.middlewares.domain.DomainMiddleware', - 'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware' -]) +MIDDLEWARES.extend( + [ + "frontera.contrib.middlewares.domain.DomainMiddleware", + "frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware", + ] +) -#-------------------------------------------------------- +# -------------------------------------------------------- # Crawl frontier backend -#-------------------------------------------------------- +# -------------------------------------------------------- QUEUE_HOSTNAME_PARTITIONING = True -URL_FINGERPRINT_FUNCTION='frontera.utils.fingerprint.hostname_local_fingerprint' +URL_FINGERPRINT_FUNCTION = "frontera.utils.fingerprint.hostname_local_fingerprint" -#MESSAGE_BUS='frontera.contrib.messagebus.kafkabus.MessageBus' -#KAFKA_LOCATION = 'localhost:9092' -#SCORING_GROUP = 'scrapy-scoring' -#SCORING_TOPIC = 'frontier-score' \ No newline at end of file +# MESSAGE_BUS='frontera.contrib.messagebus.kafkabus.MessageBus' +# KAFKA_LOCATION = 'localhost:9092' +# SCORING_GROUP = 'scrapy-scoring' +# SCORING_TOPIC = 'frontier-score' diff --git a/examples/cluster/bc/config/dbw.py b/examples/cluster/bc/config/dbw.py index 8c6f76d48..0489d1c59 100644 --- a/examples/cluster/bc/config/dbw.py +++ b/examples/cluster/bc/config/dbw.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from .worker import * -LOGGING_CONFIG='logging-db.conf' \ No newline at end of file +LOGGING_CONFIG = "logging-db.conf" diff --git a/examples/cluster/bc/config/spider.py b/examples/cluster/bc/config/spider.py index c6721e78d..0d6a652c4 100644 --- a/examples/cluster/bc/config/spider.py +++ b/examples/cluster/bc/config/spider.py @@ -1,6 +1,4 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from .common import * -BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' +BACKEND = "frontera.contrib.backends.remote.messagebus.MessageBusBackend" KAFKA_GET_TIMEOUT = 0.5 diff --git a/examples/cluster/bc/config/sw.py b/examples/cluster/bc/config/sw.py index b3d720e5c..1eb197c61 100644 --- a/examples/cluster/bc/config/sw.py +++ b/examples/cluster/bc/config/sw.py @@ -1,7 +1,4 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from .worker import * -LOGGING_CONFIG='logging-sw.conf' +LOGGING_CONFIG = "logging-sw.conf" MAX_PAGES_PER_HOSTNAME = 10 - diff --git a/examples/cluster/bc/config/worker.py b/examples/cluster/bc/config/worker.py index 045ce7260..31d582006 100644 --- a/examples/cluster/bc/config/worker.py +++ b/examples/cluster/bc/config/worker.py @@ -1,9 +1,7 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from .common import * -BACKEND = 'frontera.contrib.backends.hbase.HBaseBackend' +BACKEND = "frontera.contrib.backends.hbase.HBaseBackend" HBASE_DROP_ALL_TABLES = True MAX_NEXT_REQUESTS = 2048 -NEW_BATCH_DELAY = 3.0 \ No newline at end of file +NEW_BATCH_DELAY = 3.0 diff --git a/examples/cluster/bc/items.py b/examples/cluster/bc/items.py index eef9562e9..39c688e1a 100644 --- a/examples/cluster/bc/items.py +++ b/examples/cluster/bc/items.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # Define here the models for your scraped items # # See documentation in: diff --git a/examples/cluster/bc/pipelines.py b/examples/cluster/bc/pipelines.py index 817f23472..618e61f4b 100644 --- a/examples/cluster/bc/pipelines.py +++ b/examples/cluster/bc/pipelines.py @@ -1,11 +1,9 @@ -# -*- 
coding: utf-8 -*- - # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html -class BcPipeline(object): +class BcPipeline: def process_item(self, item, spider): return item diff --git a/examples/cluster/bc/settings.py b/examples/cluster/bc/settings.py index 306b97736..7960deb72 100644 --- a/examples/cluster/bc/settings.py +++ b/examples/cluster/bc/settings.py @@ -1,41 +1,37 @@ -# -*- coding: utf-8 -*- -FRONTERA_SETTINGS = 'bc.config.spider' +FRONTERA_SETTINGS = "bc.config.spider" -SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler' +SCHEDULER = "frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler" SPIDER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999, - 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware": 999, + "frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader": 1, } DOWNLOADER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999, + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware": 999, } -BOT_NAME = 'bc' +BOT_NAME = "bc" -SPIDER_MODULES = ['bc.spiders'] -NEWSPIDER_MODULE = 'bc.spiders' +SPIDER_MODULES = ["bc.spiders"] +NEWSPIDER_MODULE = "bc.spiders" -CONCURRENT_REQUESTS=256 -CONCURRENT_REQUESTS_PER_DOMAIN=1 +CONCURRENT_REQUESTS = 256 +CONCURRENT_REQUESTS_PER_DOMAIN = 1 -DOWNLOAD_DELAY=0.0 -DOWNLOAD_TIMEOUT=180 +DOWNLOAD_DELAY = 0.0 +DOWNLOAD_TIMEOUT = 180 RANDOMIZE_DOWNLOAD_DELAY = False REACTOR_THREADPOOL_MAXSIZE = 30 DNS_TIMEOUT = 120 -COOKIES_ENABLED=False +COOKIES_ENABLED = False RETRY_ENABLED = False REDIRECT_ENABLED = True AJAXCRAWL_ENABLED = False -AUTOTHROTTLE_ENABLED=True -AUTOTHROTTLE_START_DELAY=0.01 +AUTOTHROTTLE_ENABLED = True +AUTOTHROTTLE_START_DELAY = 0.01 AUTOTHROTTLE_MAX_DELAY = 3.0 -AUTOTHROTTLE_DEBUG=False - -LOG_LEVEL='INFO' - - +AUTOTHROTTLE_DEBUG = False +LOG_LEVEL = "INFO" diff --git a/examples/cluster/bc/spiders/bc.py b/examples/cluster/bc/spiders/bc.py index ac3ee3bf3..dda3f2045 100644 --- a/examples/cluster/bc/spiders/bc.py +++ b/examples/cluster/bc/spiders/bc.py @@ -1,15 +1,16 @@ -# -*- coding: utf-8 -*- -from scrapy.spider import Spider +from scrapy import signals +from scrapy.exceptions import DontCloseSpider from scrapy.http import Request from scrapy.http.response.html import HtmlResponse from scrapy.linkextractors import LinkExtractor -from scrapy import signals +from scrapy.spider import Spider + class BCSpider(Spider): - name = 'bc' + name = "bc" def __init__(self, *args, **kwargs): - super(BCSpider, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.le = LinkExtractor() def parse(self, response): @@ -23,7 +24,7 @@ def parse(self, response): @classmethod def from_crawler(cls, crawler, *args, **kwargs): - spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs) + spider = super().from_crawler(crawler, *args, **kwargs) spider._set_crawler(crawler) spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle) return spider diff --git a/examples/general-spider/frontier/__init__.py b/examples/general-spider/frontier/__init__.py index 7c68785e9..e69de29bb 100644 --- a/examples/general-spider/frontier/__init__.py +++ b/examples/general-spider/frontier/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff 
--git a/examples/general-spider/frontier/single.py b/examples/general-spider/frontier/single.py index f86c135bd..e13e7b5d4 100644 --- a/examples/general-spider/frontier/single.py +++ b/examples/general-spider/frontier/single.py @@ -1,12 +1,11 @@ -# -*- coding: utf-8 -*- -import logging +from datetime import timedelta -BACKEND = 'frontera.contrib.backends.sqlalchemy.revisiting.Backend' -SQLALCHEMYBACKEND_ENGINE = 'sqlite:///url_storage.sqlite' +BACKEND = "frontera.contrib.backends.sqlalchemy.revisiting.Backend" +SQLALCHEMYBACKEND_ENGINE = "sqlite:///url_storage.sqlite" SQLALCHEMYBACKEND_ENGINE_ECHO = False SQLALCHEMYBACKEND_DROP_ALL_TABLES = False SQLALCHEMYBACKEND_CLEAR_CONTENT = False -from datetime import timedelta + SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=3) DELAY_ON_EMPTY = 20.0 diff --git a/examples/general-spider/frontier/spider_settings.py b/examples/general-spider/frontier/spider_settings.py index 2e35ff6cb..3575b82af 100644 --- a/examples/general-spider/frontier/spider_settings.py +++ b/examples/general-spider/frontier/spider_settings.py @@ -1,18 +1,17 @@ -# -*- coding: utf-8 -*- from frontera.settings.default_settings import MIDDLEWARES MAX_NEXT_REQUESTS = 256 DELAY_ON_EMPTY = 5.0 -MIDDLEWARES.extend([ - 'frontera.contrib.middlewares.domain.DomainMiddleware', - 'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware' -]) +MIDDLEWARES.extend( + [ + "frontera.contrib.middlewares.domain.DomainMiddleware", + "frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware", + ] +) -#-------------------------------------------------------- +# -------------------------------------------------------- # Crawl frontier backend -#-------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.remote.messagebus.MessageBusBackend' +# -------------------------------------------------------- +BACKEND = "frontera.contrib.backends.remote.messagebus.MessageBusBackend" SPIDER_FEED_PARTITIONS = 2 - - diff --git a/examples/general-spider/frontier/workersettings.py b/examples/general-spider/frontier/workersettings.py index fa0e59adf..5107da90b 100644 --- a/examples/general-spider/frontier/workersettings.py +++ b/examples/general-spider/frontier/workersettings.py @@ -1,31 +1,32 @@ -# -*- coding: utf-8 -*- +from datetime import timedelta + from frontera.settings.default_settings import MIDDLEWARES MAX_NEXT_REQUESTS = 512 SPIDER_FEED_PARTITIONS = 2 SPIDER_LOG_PARTITIONS = 1 -#-------------------------------------------------------- +# -------------------------------------------------------- # Url storage -#-------------------------------------------------------- +# -------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.sqlalchemy.SQLAlchemyBackend' -#BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' +BACKEND = "frontera.contrib.backends.sqlalchemy.SQLAlchemyBackend" +# BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' -SQLALCHEMYBACKEND_ENGINE = 'sqlite:///url_storage_dist.sqlite' +SQLALCHEMYBACKEND_ENGINE = "sqlite:///url_storage_dist.sqlite" SQLALCHEMYBACKEND_ENGINE_ECHO = False SQLALCHEMYBACKEND_DROP_ALL_TABLES = True SQLALCHEMYBACKEND_CLEAR_CONTENT = True -from datetime import timedelta -SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=3) - -MIDDLEWARES.extend([ - 'frontera.contrib.middlewares.domain.DomainMiddleware', - 'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware' -]) +SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=3) 
-LOGGING_CONFIG='logging.conf' +MIDDLEWARES.extend( + [ + "frontera.contrib.middlewares.domain.DomainMiddleware", + "frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware", + ] +) +LOGGING_CONFIG = "logging.conf" diff --git a/examples/general-spider/general/settings.py b/examples/general-spider/general/settings.py index e29b425e7..6d33e7983 100644 --- a/examples/general-spider/general/settings.py +++ b/examples/general-spider/general/settings.py @@ -1,26 +1,25 @@ -# -*- coding: utf-8 -*- -BOT_NAME = 'general' +BOT_NAME = "general" -SPIDER_MODULES = ['general.spiders'] -NEWSPIDER_MODULE = 'general.spiders' +SPIDER_MODULES = ["general.spiders"] +NEWSPIDER_MODULE = "general.spiders" # Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'topic (+http://www.yourdomain.com)' +# USER_AGENT = 'topic (+http://www.yourdomain.com)' SPIDER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader': 1, - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000, - 'scrapy.spidermiddleware.depth.DepthMiddleware': None, - 'scrapy.spidermiddleware.offsite.OffsiteMiddleware': None, - 'scrapy.spidermiddleware.referer.RefererMiddleware': None, - 'scrapy.spidermiddleware.urllength.UrlLengthMiddleware': None + "frontera.contrib.scrapy.middlewares.seeds.file.FileSeedLoader": 1, + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware": 1000, + "scrapy.spidermiddleware.depth.DepthMiddleware": None, + "scrapy.spidermiddleware.offsite.OffsiteMiddleware": None, + "scrapy.spidermiddleware.referer.RefererMiddleware": None, + "scrapy.spidermiddleware.urllength.UrlLengthMiddleware": None, } DOWNLOADER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000, + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware": 1000, } -SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler' +SCHEDULER = "frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler" HTTPCACHE_ENABLED = False @@ -28,7 +27,7 @@ COOKIES_ENABLED = False DOWNLOAD_TIMEOUT = 240 RETRY_ENABLED = False -DOWNLOAD_MAXSIZE = 1*1024*1024 +DOWNLOAD_MAXSIZE = 1 * 1024 * 1024 # auto throttling AUTOTHROTTLE_ENABLED = True @@ -42,7 +41,7 @@ CONCURRENT_REQUESTS_PER_DOMAIN = 10 DOWNLOAD_DELAY = 0.0 -LOG_LEVEL = 'INFO' +LOG_LEVEL = "INFO" REACTOR_THREADPOOL_MAXSIZE = 32 DNS_TIMEOUT = 180 diff --git a/examples/general-spider/general/spiders/__init__.py b/examples/general-spider/general/spiders/__init__.py index 8b1378917..e69de29bb 100644 --- a/examples/general-spider/general/spiders/__init__.py +++ b/examples/general-spider/general/spiders/__init__.py @@ -1 +0,0 @@ - diff --git a/examples/general-spider/general/spiders/general_spider.py b/examples/general-spider/general/spiders/general_spider.py index 6a28bfff5..b0b98d780 100644 --- a/examples/general-spider/general/spiders/general_spider.py +++ b/examples/general-spider/general/spiders/general_spider.py @@ -1,14 +1,14 @@ -from scrapy.spider import Spider from scrapy.http import Request from scrapy.http.response.html import HtmlResponse from scrapy.linkextractors import LinkExtractor +from scrapy.spider import Spider class GeneralSpider(Spider): - name = 'general' + name = "general" def __init__(self, *args, **kwargs): - super(GeneralSpider, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.le = LinkExtractor() def parse(self, response): @@ -19,4 +19,3 @@ def parse(self, response): r = 
Request(url=link.url) r.meta.update(link_text=link.text) yield r - diff --git a/examples/grequests/links_follower.py b/examples/grequests/links_follower.py index 08225ef11..1f759d37e 100644 --- a/examples/grequests/links_follower.py +++ b/examples/grequests/links_follower.py @@ -1,45 +1,37 @@ -from __future__ import print_function - import re from time import time +from urllib.parse import urljoin -from grequests import AsyncRequest, get as grequests_get, map as grequests_map +from grequests import AsyncRequest +from grequests import get as grequests_get +from grequests import map as grequests_map +from frontera import Settings +from frontera.contrib.requests.converters import ResponseConverter +from frontera.core import get_slot_key from frontera.core.models import Request as FrontierRequest from frontera.utils.converters import BaseRequestConverter -from frontera.contrib.requests.converters import ResponseConverter - from frontera.utils.managers import FrontierManagerWrapper -from frontera.core import get_slot_key -from frontera import Settings - -from six import iteritems -from six.moves.urllib.parse import urljoin - SETTINGS = Settings() -SETTINGS.BACKEND = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' +SETTINGS.BACKEND = "frontera.contrib.backends.memory.MemoryDFSOverusedBackend" SETTINGS.LOGGING_MANAGER_ENABLED = True SETTINGS.LOGGING_BACKEND_ENABLED = False SETTINGS.MAX_REQUESTS = 0 SETTINGS.MAX_NEXT_REQUESTS = 40 -SEEDS = [ - 'http://www.imdb.com', - 'http://www.bbc.com/', - 'http://www.amazon.com/' -] +SEEDS = ["http://www.imdb.com", "http://www.bbc.com/", "http://www.amazon.com/"] -LINK_RE = re.compile(r'', re.I) +LINK_RE = re.compile(r'', re.IGNORECASE) class GRequestsConverter(BaseRequestConverter): """Converts between frontera and grequests request objects""" + @classmethod def to_frontier(cls, request): """request: AsyncRequest > Frontier""" - return FrontierRequest(url=request.url, - method=request.method) + return FrontierRequest(url=request.url, method=request.method) @classmethod def from_frontier(cls, request): @@ -49,17 +41,17 @@ def from_frontier(cls, request): class GRequestsFrontierManager(FrontierManagerWrapper): def __init__(self, settings): - super(GRequestsFrontierManager, self).__init__(settings) + super().__init__(settings) self.request_converter = GRequestsConverter() self.response_converter = ResponseConverter(self.request_converter) -class HostnameStatistics(object): +class HostnameStatistics: def __init__(self): self.stats = {} def on_request(self, request): - key = get_slot_key(request, 'domain') + key = get_slot_key(request, "domain") self.stats[key] = time() def collect_overused_keys(self): @@ -67,8 +59,9 @@ def collect_overused_keys(self): return [ key - for key, timestamp in iteritems(self.stats) - if ts - timestamp < 5.0 # querying each hostname with at least 5 seconds delay + for key, timestamp in self.stats.items() + if ts - timestamp + < 5.0 # querying each hostname with at least 5 seconds delay ] @@ -84,15 +77,15 @@ def extract_page_links(response): """ -if __name__ == '__main__': - +if __name__ == "__main__": frontier = GRequestsFrontierManager(SETTINGS) stats = HostnameStatistics() frontier.add_seeds([grequests_get(url=url.strip()) for url in SEEDS]) while True: + def error_handler(request, exception): - print('Failed to process request', request.url, 'Error:', exception) + print("Failed to process request", request.url, "Error:", exception) frontier.request_error(request, str(exception)) def callback(response, **kwargs): @@ 
-103,17 +96,17 @@ def callback(response, **kwargs): frontier.links_extracted(response.request, links) frontier.page_crawled(response) - print('Crawled', response.url, '(found', len(links), 'urls)') + print("Crawled", response.url, "(found", len(links), "urls)") next_requests = frontier.get_next_requests( frontier.manager.max_next_requests, - key_type='domain', + key_type="domain", overused_keys=stats.collect_overused_keys(), ) if not next_requests: continue for r in next_requests: - r.kwargs['hooks'] = {'response': callback} + r.kwargs["hooks"] = {"response": callback} grequests_map(next_requests, size=10, exception_handler=error_handler) diff --git a/examples/requests/links_follower.py b/examples/requests/links_follower.py index 57ff9b877..ce23d2d3d 100644 --- a/examples/requests/links_follower.py +++ b/examples/requests/links_follower.py @@ -1,34 +1,30 @@ -from __future__ import print_function - import re +from urllib.parse import urljoin import requests -from frontera.contrib.requests.manager import RequestsFrontierManager from frontera import Settings - -from six.moves.urllib.parse import urljoin - +from frontera.contrib.requests.manager import RequestsFrontierManager SETTINGS = Settings() -SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO' +SETTINGS.BACKEND = "frontera.contrib.backends.memory.FIFO" SETTINGS.LOGGING_MANAGER_ENABLED = True SETTINGS.LOGGING_BACKEND_ENABLED = True SETTINGS.MAX_REQUESTS = 100 SETTINGS.MAX_NEXT_REQUESTS = 10 SEEDS = [ - 'http://www.imdb.com', + "http://www.imdb.com", ] -LINK_RE = re.compile(r'', re.I) +LINK_RE = re.compile(r'', re.IGNORECASE) def extract_page_links(response): return [urljoin(response.url, link) for link in LINK_RE.findall(response.text)] -if __name__ == '__main__': +if __name__ == "__main__": frontier = RequestsFrontierManager(SETTINGS) frontier.add_seeds([requests.Request(url=url) for url in SEEDS]) while True: @@ -36,18 +32,17 @@ def extract_page_links(response): if not next_requests: break for request in next_requests: - try: - response = requests.get(request.url) - links = [ - requests.Request(url=url) - for url in extract_page_links(response) - ] - frontier.page_crawled(response) - print('Crawled', response.url, '(found', len(links), 'urls)') - - if links: - frontier.links_extracted(request, links) - except requests.RequestException as e: - error_code = type(e).__name__ - frontier.request_error(request, error_code) - print('Failed to process request', request.url, 'Error:', e) + try: + response = requests.get(request.url) + links = [ + requests.Request(url=url) for url in extract_page_links(response) + ] + frontier.page_crawled(response) + print("Crawled", response.url, "(found", len(links), "urls)") + + if links: + frontier.links_extracted(request, links) + except requests.RequestException as e: + error_code = type(e).__name__ + frontier.request_error(request, error_code) + print("Failed to process request", request.url, "Error:", e) diff --git a/examples/scrapy_recording/scrapy_recording/render_recording.py b/examples/scrapy_recording/scrapy_recording/render_recording.py index 4fa2ce8d5..113109de0 100644 --- a/examples/scrapy_recording/scrapy_recording/render_recording.py +++ b/examples/scrapy_recording/scrapy_recording/render_recording.py @@ -1,5 +1,9 @@ from frontera import graphs -graph = graphs.Manager('sqlite:///recordings/record.db') -graph.render(filename='recordings/record.png', label='Record graph', use_urls=True, include_ids=True) - +graph = graphs.Manager("sqlite:///recordings/record.db") +graph.render( + 
filename="recordings/record.png", + label="Record graph", + use_urls=True, + include_ids=True, +) diff --git a/examples/scrapy_recording/scrapy_recording/settings.py b/examples/scrapy_recording/scrapy_recording/settings.py index c77502824..062d2feaf 100644 --- a/examples/scrapy_recording/scrapy_recording/settings.py +++ b/examples/scrapy_recording/scrapy_recording/settings.py @@ -1,10 +1,10 @@ -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- # Scrapy Settings -#-------------------------------------------------------------------------- -BOT_NAME = 'scrapy_frontier' +# -------------------------------------------------------------------------- +BOT_NAME = "scrapy_frontier" -SPIDER_MODULES = ['scrapy_recording.spiders'] -NEWSPIDER_MODULE = 'scrapy_recording.spiders' +SPIDER_MODULES = ["scrapy_recording.spiders"] +NEWSPIDER_MODULE = "scrapy_recording.spiders" HTTPCACHE_ENABLED = True REDIRECT_ENABLED = True @@ -17,18 +17,18 @@ LOGSTATS_INTERVAL = 10 -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- # Recorder Settings -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- SPIDER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999 + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware": 999 } DOWNLOADER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999 + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware": 999 } -SCHEDULER = 'frontera.contrib.scrapy.schedulers.recording.RecorderScheduler' +SCHEDULER = "frontera.contrib.scrapy.schedulers.recording.RecorderScheduler" RECORDER_ENABLED = True -RECORDER_STORAGE_ENGINE = 'sqlite:///scrapy_recording/recordings/record.db' +RECORDER_STORAGE_ENGINE = "sqlite:///scrapy_recording/recordings/record.db" RECORDER_STORAGE_DROP_ALL_TABLES = True RECORDER_STORAGE_CLEAR_CONTENT = True diff --git a/examples/scrapy_recording/scrapy_recording/spiders/recorder.py b/examples/scrapy_recording/scrapy_recording/spiders/recorder.py index d53c10de0..574c6a420 100644 --- a/examples/scrapy_recording/scrapy_recording/spiders/recorder.py +++ b/examples/scrapy_recording/scrapy_recording/spiders/recorder.py @@ -2,32 +2,39 @@ from scrapy.linkextractors.regex import RegexLinkExtractor from scrapy.spiders import CrawlSpider, Rule +DOMAIN = "diffeo.com" +ALLOWED_RE = "http://" + DOMAIN -DOMAIN = 'diffeo.com' -ALLOWED_RE = 'http://' + DOMAIN - -class FallbackLinkExtractor(object): +class FallbackLinkExtractor: def __init__(self, extractors): self.extractors = extractors def extract_links(self, response): for lx in self.extractors: - links = lx.extract_links(response) - return links + return lx.extract_links(response) + return None class MySpider(CrawlSpider): - name = 'recorder' + name = "recorder" start_urls = [ - 'http://' + DOMAIN, + "http://" + DOMAIN, ] allowed_domains = [DOMAIN] - rules = [Rule(FallbackLinkExtractor([ - LinkExtractor(allow=ALLOWED_RE), - RegexLinkExtractor(allow=ALLOWED_RE), - ]), callback='parse_page', follow=True)] + rules = [ + Rule( + FallbackLinkExtractor( + [ + LinkExtractor(allow=ALLOWED_RE), + RegexLinkExtractor(allow=ALLOWED_RE), + ] + ), + callback="parse_page", + follow=True, + ) + ] def 
parse_page(self, response): pass diff --git a/examples/scrapy_recording/scrapy_recording/test_frontier.py b/examples/scrapy_recording/scrapy_recording/test_frontier.py index f72e2f3d6..13cbfae21 100644 --- a/examples/scrapy_recording/scrapy_recording/test_frontier.py +++ b/examples/scrapy_recording/scrapy_recording/test_frontier.py @@ -1,18 +1,19 @@ """ Frontier tester using recording data """ + from frontera import FrontierManager, FrontierTester, Settings, graphs SETTINGS = Settings() -SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO' +SETTINGS.BACKEND = "frontera.contrib.backends.memory_heapq.FIFO" SETTINGS.LOGGING_MANAGER_ENABLED = True SETTINGS.LOGGING_BACKEND_ENABLED = True SETTINGS.LOGGING_DEBUGGING_ENABLED = False -if __name__ == '__main__': +if __name__ == "__main__": # Graph - graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db') + graph = graphs.Manager("sqlite:///recordings/scrapinghub.com.db") # Frontier frontier = FrontierManager.from_settings(SETTINGS) @@ -24,15 +25,15 @@ tester.run() # Show frontier pages - print '-'*80 - print ' Frontier pages' - print '-'*80 + print("-" * 80) + print(" Frontier pages") + print("-" * 80) for page in frontier.backend.pages.values(): - print page.url, page.depth, page.state + print(page.url, page.depth, page.state) # Show crawling sequence - print '-'*80 - print ' Crawling sequence' - print '-'*80 + print("-" * 80) + print(" Crawling sequence") + print("-" * 80) for page in tester.sequence: - print page.url + print(page.url) diff --git a/examples/scripts/01_site.py b/examples/scripts/01_site.py index 4a0e18ee5..17ada1a49 100644 --- a/examples/scripts/01_site.py +++ b/examples/scripts/01_site.py @@ -1,6 +1,7 @@ """ Graph manager example with single site """ + from frontera import graphs SITE = [ @@ -24,15 +25,15 @@ def test_site(site): graph.add_site(site) # Show graph pages - print '-'*80 + print("-" * 80) for page in graph.pages: - print page, page.status + print(page, page.status) # Show single page a_page = graph.get_page("A") - print a_page.url, [link.url for link in a_page.links] + print(a_page.url, [link.url for link in a_page.links]) -if __name__ == '__main__': +if __name__ == "__main__": test_site(SITE) - test_site(SITE_WITH_STATUS_CODES) \ No newline at end of file + test_site(SITE_WITH_STATUS_CODES) diff --git a/examples/scripts/02_site_list.py b/examples/scripts/02_site_list.py index 0fe9c76ce..94b1c9321 100644 --- a/examples/scripts/02_site_list.py +++ b/examples/scripts/02_site_list.py @@ -1,6 +1,7 @@ """ Graph manager example with site list """ + from frontera import graphs SITE_LIST = [ @@ -16,7 +17,7 @@ ], ] -if __name__ == '__main__': +if __name__ == "__main__": # Create graph graph = graphs.Manager() @@ -25,4 +26,4 @@ # Show graph pages for page in graph.pages: - print page + print(page) diff --git a/examples/scripts/03_graph_with_db.py b/examples/scripts/03_graph_with_db.py index eb631a365..12af58407 100644 --- a/examples/scripts/03_graph_with_db.py +++ b/examples/scripts/03_graph_with_db.py @@ -1,6 +1,7 @@ """ Graph manager with database """ + from frontera import graphs SITE_LIST = [ @@ -16,14 +17,13 @@ ], ] -if __name__ == '__main__': +if __name__ == "__main__": # Create graph with sqlite db - graph = graphs.Manager('sqlite:///data/graph.db', drop_all_tables=True) + graph = graphs.Manager("sqlite:///data/graph.db", drop_all_tables=True) # Add site list to graph graph.add_site_list(SITE_LIST) # Show graph pages for page in graph.pages: - print page - + print(page) diff --git 
a/examples/scripts/04_graph_from_db.py b/examples/scripts/04_graph_from_db.py index f6be390d1..ff71cc842 100644 --- a/examples/scripts/04_graph_from_db.py +++ b/examples/scripts/04_graph_from_db.py @@ -1,13 +1,13 @@ """ Graph manager reading data from database """ + from frontera import graphs -if __name__ == '__main__': +if __name__ == "__main__": # Create graph with sqlite db - graph = graphs.Manager('sqlite:///data/graph.db') + graph = graphs.Manager("sqlite:///data/graph.db") # Show graph pages for page in graph.pages: - print page - + print(page) diff --git a/examples/scripts/05_generate_diagrams.py b/examples/scripts/05_generate_diagrams.py index 232869ffe..517e1edfc 100644 --- a/examples/scripts/05_generate_diagrams.py +++ b/examples/scripts/05_generate_diagrams.py @@ -1,6 +1,7 @@ """ Graph diagram generation example """ + from frontera import graphs SITE_LIST_A = [ @@ -24,11 +25,12 @@ def generate_graph(site_list, filename, title, use_urls=False): - print 'Generating diagram "%s"...' % title + print(f'Generating diagram "{title}"...') graph = graphs.Manager() graph.add_site_list(site_list) graph.render(filename=filename, label=title, use_urls=use_urls) -if __name__ == '__main__': - generate_graph(SITE_LIST_A, 'diagrams/A.png', 'Example Graph A') - generate_graph(SITE_LIST_B, 'diagrams/B.png', 'Example Graph B', True) \ No newline at end of file + +if __name__ == "__main__": + generate_graph(SITE_LIST_A, "diagrams/A.png", "Example Graph A") + generate_graph(SITE_LIST_B, "diagrams/B.png", "Example Graph B", True) diff --git a/examples/scripts/06_frontier.py b/examples/scripts/06_frontier.py index aa69f3103..7eb984547 100644 --- a/examples/scripts/06_frontier.py +++ b/examples/scripts/06_frontier.py @@ -1,25 +1,27 @@ """ Frontier from parameters example """ -from frontera import FrontierManager, graphs, Request, Response -if __name__ == '__main__': +from frontera import FrontierManager, Request, Response, graphs + +if __name__ == "__main__": # Create graph - graph = graphs.Manager('sqlite:///data/graph.db') + graph = graphs.Manager("sqlite:///data/graph.db") # Create frontier frontier = FrontierManager( - request_model='frontera.core.models.Request', - response_model='frontera.core.models.Response', - backend='frontera.contrib.backends.memory.FIFO', - logger='frontera.logger.FrontierLogger', - event_log_manager='frontera.logger.events.EventLogManager', + request_model="frontera.core.models.Request", + response_model="frontera.core.models.Response", + backend="frontera.contrib.backends.memory.FIFO", + logger="frontera.logger.FrontierLogger", + event_log_manager="frontera.logger.events.EventLogManager", middlewares=[ - 'frontera.contrib.middlewares.domain.DomainMiddleware', - 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', - 'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware', + "frontera.contrib.middlewares.domain.DomainMiddleware", + "frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware", + "frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware", ], - test_mode=True) + test_mode=True, + ) # Add seeds frontier.add_seeds([Request(seed.url) for seed in graph.seeds]) @@ -29,14 +31,13 @@ # Crawl pages for request in next_requests: - # Fake page crawling crawled_page = graph.get_page(request.url) # Create response - response = Response(url=request.url, - status_code=crawled_page.status, - request=request) + response = Response( + url=request.url, status_code=crawled_page.status, request=request + ) # Create page links 
page_links = [Request(link.url) for link in crawled_page.links] diff --git a/examples/scripts/07_frontier_from_settings.py b/examples/scripts/07_frontier_from_settings.py index a64697d04..16b2ac466 100644 --- a/examples/scripts/07_frontier_from_settings.py +++ b/examples/scripts/07_frontier_from_settings.py @@ -1,18 +1,19 @@ """ Frontier initialization from settings """ -from frontera import FrontierManager, Settings, graphs, Request, Response + +from frontera import FrontierManager, Request, Response, Settings, graphs SETTINGS = Settings() -SETTINGS.BACKEND = 'frontera.contrib.backends.memory.FIFO' +SETTINGS.BACKEND = "frontera.contrib.backends.memory.FIFO" SETTINGS.LOGGING_MANAGER_ENABLED = True SETTINGS.LOGGING_BACKEND_ENABLED = True SETTINGS.LOGGING_DEBUGGING_ENABLED = True SETTINGS.TEST_MODE = True -if __name__ == '__main__': +if __name__ == "__main__": # Create graph - graph = graphs.Manager('sqlite:///data/graph.db') + graph = graphs.Manager("sqlite:///data/graph.db") # Create frontier from settings frontier = FrontierManager.from_settings(SETTINGS) @@ -25,14 +26,13 @@ # Crawl pages for request in next_requests: - # Fake page crawling crawled_page = graph.get_page(request.url) # Create response - response = Response(url=request.url, - status_code=crawled_page.status, - request=request) + response = Response( + url=request.url, status_code=crawled_page.status, request=request + ) # Create page links page_links = [Request(link.url) for link in crawled_page.links] diff --git a/examples/scripts/08_frontier_tester.py b/examples/scripts/08_frontier_tester.py index 1f942ad98..ff8e77828 100644 --- a/examples/scripts/08_frontier_tester.py +++ b/examples/scripts/08_frontier_tester.py @@ -1,11 +1,12 @@ """ Frontier tester usage example """ + from frontera import FrontierManager, FrontierTester, Settings, graphs -if __name__ == '__main__': +if __name__ == "__main__": # Graph - graph = graphs.Manager('sqlite:///data/graph.db') + graph = graphs.Manager("sqlite:///data/graph.db") # Frontier settings = Settings() @@ -23,4 +24,4 @@ # Show crawling sequence for page in tester.sequence: - print page.url + print(page.url) diff --git a/examples/scripts/09_frontier_backends.py b/examples/scripts/09_frontier_backends.py index 8f9006bfb..88d0c1b96 100644 --- a/examples/scripts/09_frontier_backends.py +++ b/examples/scripts/09_frontier_backends.py @@ -1,12 +1,13 @@ """ Test different frontier backends """ -from frontera import FrontierManager, Settings, FrontierTester, graphs + +from frontera import FrontierManager, FrontierTester, Settings, graphs def test_logic(backend): # Graph - graph = graphs.Manager('sqlite:///data/graph.db') + graph = graphs.Manager("sqlite:///data/graph.db") # Frontier settings = Settings() @@ -22,15 +23,16 @@ def test_logic(backend): tester.run(add_all_pages=True) # Show crawling sequence - print '-'*80 - print frontier.backend.name - print '-'*80 + print("-" * 80) + print(frontier.backend.name) + print("-" * 80) for page in tester.sequence: - print page.url - -if __name__ == '__main__': - test_logic('frontera.contrib.backends.memory.FIFO') - test_logic('frontera.contrib.backends.memory.LIFO') - test_logic('frontera.contrib.backends.memory.BFS') - test_logic('frontera.contrib.backends.memory.DFS') - test_logic('frontera.contrib.backends.memory.RANDOM') + print(page.url) + + +if __name__ == "__main__": + test_logic("frontera.contrib.backends.memory.FIFO") + test_logic("frontera.contrib.backends.memory.LIFO") + test_logic("frontera.contrib.backends.memory.BFS") + 
test_logic("frontera.contrib.backends.memory.DFS") + test_logic("frontera.contrib.backends.memory.RANDOM") diff --git a/examples/scripts/10_custom_backends.py b/examples/scripts/10_custom_backends.py index 3bd9313e8..7bb43cdb0 100644 --- a/examples/scripts/10_custom_backends.py +++ b/examples/scripts/10_custom_backends.py @@ -1,43 +1,48 @@ """ Custom backend example """ + import random -from frontera import FrontierManager, Settings, FrontierTester, graphs +from frontera import FrontierManager, FrontierTester, Settings, graphs from frontera.contrib.backends.memory import MemoryBaseBackend - SITE_LIST = [ - [('http://google.com', [])], - [('http://scrapinghub.com', [])], - [('http://zynga.com', [])], - [('http://microsoft.com', [])], - [('http://apple.com', [])], + [("http://google.com", [])], + [("http://scrapinghub.com", [])], + [("http://zynga.com", [])], + [("http://microsoft.com", [])], + [("http://apple.com", [])], ] +def _cmp(a, b): + return (a > b) - (a < b) + + class AlphabeticSortBackend(MemoryBaseBackend): """ Custom backend that sort pages alphabetically from url """ - name = 'Alphabetic domain name sort backend' + + name = "Alphabetic domain name sort backend" def _compare_pages(self, first, second): - return cmp(first.url, second.url) + return _cmp(first.url, second.url) class RandomSortBackend(MemoryBaseBackend): """ Custom backend that sort pages randomly """ - name = 'Random sort backend' + + name = "Random sort backend" def _compare_pages(self, first, second): return random.choice([-1, 0, 1]) def test_backend(backend): - # Graph graph = graphs.Manager() graph.add_site_list(SITE_LIST) @@ -50,9 +55,9 @@ def test_backend(backend): settings.LOGGING_DEBUGGING_ENABLED = False frontier = FrontierManager.from_settings(settings) - print '-'*80 - print frontier.backend.name - print '-'*80 + print("-" * 80) + print(frontier.backend.name) + print("-" * 80) # Tester tester = FrontierTester(frontier, graph) @@ -60,12 +65,9 @@ def test_backend(backend): # Show crawling sequence for page in tester.sequence: - print page.url - - -if __name__ == '__main__': - test_backend('10_custom_backends.AlphabeticSortBackend') - test_backend('10_custom_backends.RandomSortBackend') - + print(page.url) +if __name__ == "__main__": + test_backend("10_custom_backends.AlphabeticSortBackend") + test_backend("10_custom_backends.RandomSortBackend") diff --git a/frontera/__init__.py b/frontera/__init__.py index 8f97ddc24..f4f102ad1 100644 --- a/frontera/__init__.py +++ b/frontera/__init__.py @@ -1,10 +1,16 @@ -from __future__ import absolute_import +from .core.components import Backend, DistributedBackend, Middleware from .core.manager import FrontierManager from .core.models import Request, Response -from .core.components import Backend, DistributedBackend, Middleware from .settings import Settings from .utils.tester import FrontierTester -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions +__all__ = [ + "Backend", + "DistributedBackend", + "FrontierManager", + "FrontierTester", + "Middleware", + "Request", + "Response", + "Settings", +] diff --git a/frontera/_version.py b/frontera/_version.py deleted file mode 100644 index 80dd7a9be..000000000 --- a/frontera/_version.py +++ /dev/null @@ -1,209 +0,0 @@ - -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). 
Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.12 (https://github.com/warner/python-versioneer) - -# these strings will be replaced by git during git-archive -git_refnames = "$Format:%d$" -git_full = "$Format:%H$" - -# these strings are filled in when 'setup.py versioneer' creates _version.py -tag_prefix = "v" -parentdir_prefix = "frontera-" -versionfile_source = "frontera/_version.py" - -import os, sys, re, subprocess, errno - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): - assert isinstance(commands, list) - p = None - for c in commands: - try: - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % args[0]) - print(e) - return None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None - stdout = p.communicate()[0].strip() - if sys.version >= '3': - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % args[0]) - return None - return stdout - - -def versions_from_parentdir(parentdir_prefix, root, verbose=False): - # Source tarballs conventionally unpack into a directory that includes - # both the project name and a version string. - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" % - (root, dirname, parentdir_prefix)) - return None - return {"version": dirname[len(parentdir_prefix):], "full": ""} - -def git_get_keywords(versionfile_abs): - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - f = open(versionfile_abs,"r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - -def git_versions_from_keywords(keywords, tag_prefix, verbose=False): - if not keywords: - return {} # keyword-finding function failed to find keywords - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - return {} # unexpanded, so not in an unpacked git-archive tarball - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. 
The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return { "version": r, - "full": keywords["full"].strip() } - # no suitable tags, so we use the full revision id - if verbose: - print("no suitable tags, using full revision id") - return { "version": keywords["full"].strip(), - "full": keywords["full"].strip() } - - -def git_versions_from_vcs(tag_prefix, root, verbose=False): - # this runs 'git' from the root of the source tree. This only gets called - # if the git-archive 'subst' keywords were *not* expanded, and - # _version.py hasn't already been rewritten with a short version string, - # meaning we're inside a checked out source tree. - - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print("no .git in %s" % root) - return {} - - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - stdout = run_command(GITS, ["describe", "--tags", "--dirty", "--always"], - cwd=root) - if stdout is None: - return {} - if not stdout.startswith(tag_prefix): - if verbose: - print("tag '%s' doesn't start with prefix '%s'" % (stdout, tag_prefix)) - return {} - tag = stdout[len(tag_prefix):] - stdout = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if stdout is None: - return {} - full = stdout.strip() - if tag.endswith("-dirty"): - full += "-dirty" - return {"version": tag, "full": full} - - -def get_versions(default={"version": "unknown", "full": ""}, verbose=False): - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - keywords = { "refnames": git_refnames, "full": git_full } - ver = git_versions_from_keywords(keywords, tag_prefix, verbose) - if ver: - return rep_by_pep440(ver) - - try: - root = os.path.abspath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. 
- for i in range(len(versionfile_source.split('/'))): - root = os.path.dirname(root) - except NameError: - return default - - return rep_by_pep440( - git_versions_from_vcs(tag_prefix, root, verbose) - or versions_from_parentdir(parentdir_prefix, root, verbose) - or default) - - -def git2pep440(ver_str): - dash_count = ver_str.count('-') - if dash_count == 0: - return ver_str - elif dash_count == 1: - return ver_str.split('-')[0] + ".post.dev1.pre" - elif dash_count == 2: - tag, commits, _ = ver_str.split('-') - return ".post.dev".join([tag, commits]) - elif dash_count == 3: - tag, commits, _, _ = ver_str.split('-') - commits = str(int(commits) + 1) - return ".post.dev".join([tag, commits]) + ".pre" - else: - raise RuntimeError("Invalid version string") - - -def rep_by_pep440(ver): - if ver["full"]: # only if versions_from_parentdir was not used - ver["version"] = git2pep440(ver["version"]) - else: - ver["version"] = ver["version"].split('-')[0] - return ver diff --git a/frontera/contrib/backends/__init__.py b/frontera/contrib/backends/__init__.py index 2dc89a1ee..15dab1d9e 100644 --- a/frontera/contrib/backends/__init__.py +++ b/frontera/contrib/backends/__init__.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from collections import OrderedDict from frontera import Backend @@ -10,7 +8,8 @@ class CommonBackend(Backend): """ A simpliest possible backend, performing one-time crawl: if page was crawled once, it will not be crawled again. """ - component_name = 'Common Backend' + + component_name = "Common Backend" @classmethod def from_manager(cls, manager): @@ -29,9 +28,9 @@ def frontier_stop(self): def add_seeds(self, seeds): for seed in seeds: - seed.meta[b'depth'] = 0 + seed.meta[b"depth"] = 0 self.metadata.add_seeds(seeds) - self.states.fetch([seed.meta[b'fingerprint'] for seed in seeds]) + self.states.fetch([seed.meta[b"fingerprint"] for seed in seeds]) self.states.set_states(seeds) self._schedule(seeds) self.states.update_cache(seeds) @@ -40,36 +39,51 @@ def _schedule(self, requests): batch = [] queue_incr = 0 for request in requests: - schedule = True if request.meta[b'state'] in [States.NOT_CRAWLED, States.ERROR, None] else False - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, schedule)) + schedule = request.meta[b"state"] in [ + States.NOT_CRAWLED, + States.ERROR, + None, + ] + batch.append( + ( + request.meta[b"fingerprint"], + self._get_score(request), + request, + schedule, + ) + ) if schedule: queue_incr += 1 - request.meta[b'state'] = States.QUEUED + request.meta[b"state"] = States.QUEUED self.queue.schedule(batch) self.metadata.update_score(batch) self.queue_size += queue_incr def _get_score(self, obj): - return obj.meta.get(b'score', 1.0) + return obj.meta.get(b"score", 1.0) def get_next_requests(self, max_next_requests, **kwargs): - partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions + partitions = kwargs.pop( + "partitions", [0] + ) # TODO: Collect from all known partitions batch = [] for partition_id in partitions: - batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) + batch.extend( + self.queue.get_next_requests(max_next_requests, partition_id, **kwargs) + ) self.queue_size -= len(batch) return batch def page_crawled(self, response): - response.meta[b'state'] = States.CRAWLED + response.meta[b"state"] = States.CRAWLED self.states.update_cache(response) self.metadata.page_crawled(response) def links_extracted(self, request, links): 
to_fetch = OrderedDict() for link in links: - to_fetch[link.meta[b'fingerprint']] = link - link.meta[b'depth'] = request.meta.get(b'depth', 0)+1 + to_fetch[link.meta[b"fingerprint"]] = link + link.meta[b"depth"] = request.meta.get(b"depth", 0) + 1 self.states.fetch(to_fetch.keys()) self.states.set_states(links) unique_links = to_fetch.values() @@ -78,7 +92,7 @@ def links_extracted(self, request, links): self.states.update_cache(unique_links) def request_error(self, request, error): - request.meta[b'state'] = States.ERROR + request.meta[b"state"] = States.ERROR self.metadata.request_error(request, error) self.states.update_cache(request) diff --git a/frontera/contrib/backends/hbase.py b/frontera/contrib/backends/hbase.py index 8f60cb6d3..74baa7abe 100644 --- a/frontera/contrib/backends/hbase.py +++ b/frontera/contrib/backends/hbase.py @@ -1,40 +1,35 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.utils.url import parse_domain_from_url_fast -from frontera import DistributedBackend -from frontera.core.components import Metadata, Queue, States -from frontera.core.models import Request -from frontera.contrib.backends.partitioners import Crc32NamePartitioner -from frontera.utils.misc import chunks, get_crc32 -from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder - -from happybase import Connection -from msgpack import Unpacker, Packer -import six -from six.moves import range -from w3lib.util import to_bytes - -from struct import pack, unpack -from datetime import datetime -from calendar import timegm -from time import time +import logging from binascii import hexlify, unhexlify +from calendar import timegm +from collections.abc import Iterable +from datetime import datetime from io import BytesIO from random import choice -from collections import Iterable -import logging +from struct import pack, unpack +from time import time + +from happybase import Connection +from msgpack import Packer, Unpacker +from w3lib.util import to_bytes +from frontera import DistributedBackend +from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from frontera.contrib.backends.remote.codecs.msgpack import Decoder, Encoder +from frontera.core.components import Metadata, Queue, States +from frontera.core.models import Request +from frontera.utils.misc import chunks, get_crc32 +from frontera.utils.url import parse_domain_from_url_fast _pack_functions = { - 'url': to_bytes, - 'depth': lambda x: pack('>I', 0), - 'created_at': lambda x: pack('>Q', x), - 'status_code': lambda x: pack('>H', x), - 'state': lambda x: pack('>B', x), - 'error': to_bytes, - 'domain_fingerprint': to_bytes, - 'score': lambda x: pack('>f', x), - 'content': to_bytes + "url": to_bytes, + "depth": lambda x: pack(">I", 0), + "created_at": lambda x: pack(">Q", x), + "status_code": lambda x: pack(">H", x), + "state": lambda x: pack(">B", x), + "error": to_bytes, + "domain_fingerprint": to_bytes, + "score": lambda x: pack(">f", x), + "content": to_bytes, } @@ -44,16 +39,16 @@ def unpack_score(blob): def prepare_hbase_object(obj=None, **kwargs): if not obj: - obj = dict() - for k, v in six.iteritems(kwargs): - if k in ['score', 'state']: - cf = 's' - elif k == 'content': - cf = 'c' + obj = {} + for k, v in kwargs.items(): + if k in ["score", "state"]: + cf = "s" + elif k == "content": + cf = "c" else: - cf = 'm' + cf = "m" func = _pack_functions[k] - obj[cf + ':' + k] = func(v) + obj[cf + ":" + k] = func(v) return obj @@ -63,12 +58,11 @@ def utcnow_timestamp(): class 
HBaseQueue(Queue): - GET_RETRIES = 3 def __init__(self, connection, partitions, table_name, drop=False): self.connection = connection - self.partitions = [i for i in range(0, partitions)] + self.partitions = list(range(partitions)) self.partitioner = Crc32NamePartitioner(self.partitions) self.logger = logging.getLogger("hbase.queue") self.table_name = to_bytes(table_name) @@ -79,10 +73,13 @@ def __init__(self, connection, partitions, table_name, drop=False): tables.remove(self.table_name) if self.table_name not in tables: - self.connection.create_table(self.table_name, {'f': {'max_versions': 1, 'block_cache_enabled': 1}}) + self.connection.create_table( + self.table_name, {"f": {"max_versions": 1, "block_cache_enabled": 1}} + ) class DumbResponse: pass + self.decoder = Decoder(Request, DumbResponse) self.encoder = Encoder(Request) @@ -93,20 +90,26 @@ def frontier_stop(self): pass def schedule(self, batch): - to_schedule = dict() + to_schedule = {} now = int(time()) for fprint, score, request, schedule in batch: if schedule: - if b'domain' not in request.meta: # TODO: this have to be done always by DomainMiddleware, + if ( + b"domain" not in request.meta + ): # TODO: this have to be done always by DomainMiddleware, # so I propose to require DomainMiddleware by HBaseBackend and remove that code _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) - request.meta[b'domain'] = {'name': hostname} - timestamp = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else now + self.logger.error( + "Can't get hostname for URL %s, fingerprint %s", + request.url, + fprint, + ) + request.meta[b"domain"] = {"name": hostname} + timestamp = request.meta.get(b"crawl_at", now) to_schedule.setdefault(timestamp, []).append((request, score)) - for timestamp, batch in six.iteritems(to_schedule): - self._schedule(batch, timestamp) + for timestamp, batch_ in to_schedule.items(): + self._schedule(batch_, timestamp) def _schedule(self, batch, timestamp): """ @@ -127,6 +130,7 @@ def _schedule(self, batch, timestamp): :param batch: iterable of Request objects :return: """ + def get_interval(score, resolution): if score < 0.0 or score > 1.0: raise OverflowError @@ -136,40 +140,50 @@ def get_interval(score, resolution): i = i - 1 # last interval is inclusive from right return (i * resolution, (i + 1) * resolution) - random_str = int(time() * 1E+6) - data = dict() + random_str = int(time() * 1e6) + data = {} for request, score in batch: - domain = request.meta[b'domain'] - fingerprint = request.meta[b'fingerprint'] - if type(domain) == dict: - partition_id = self.partitioner.partition(domain[b'name'], self.partitions) - host_crc32 = get_crc32(domain[b'name']) - elif type(domain) == int: - partition_id = self.partitioner.partition_by_hash(domain, self.partitions) + domain = request.meta[b"domain"] + fingerprint = request.meta[b"fingerprint"] + if type(domain) is dict: + partition_id = self.partitioner.partition( + domain[b"name"], self.partitions + ) + host_crc32 = get_crc32(domain[b"name"]) + elif type(domain) is int: + partition_id = self.partitioner.partition_by_hash( + domain, self.partitions + ) host_crc32 = domain else: raise TypeError("domain of unknown type.") - item = (unhexlify(fingerprint), host_crc32, self.encoder.encode_request(request), score) - score = 1 - score # because of lexicographical sort in HBase - rk = "%d_%s_%d" % (partition_id, "%0.2f_%0.2f" % get_interval(score, 0.01), 
random_str) - data.setdefault(rk, []).append((score, item)) + item = ( + unhexlify(fingerprint), + host_crc32, + self.encoder.encode_request(request), + score, + ) + hbase_score = 1 - score # because of lexicographical sort in HBase + low, high = get_interval(hbase_score, 0.01) + rk = f"{partition_id}_{low:0.2f}_{high:0.2f}_{random_str}" + data.setdefault(rk, []).append((hbase_score, item)) table = self.connection.table(self.table_name) with table.batch(transaction=True) as b: - for rk, tuples in six.iteritems(data): - obj = dict() + for rk, tuples in data.items(): + obj = {} for score, item in tuples: - column = 'f:%0.3f_%0.3f' % get_interval(score, 0.001) + column = "f:{:0.3f}_{:0.3f}".format(*get_interval(score, 0.001)) obj.setdefault(column, []).append(item) - final = dict() + final = {} packer = Packer() - for column, items in six.iteritems(obj): + for column, items in obj.items(): stream = BytesIO() for item in items: stream.write(packer.pack(item)) final[column] = stream.getvalue() - final[b'f:t'] = str(timestamp) + final[b"f:t"] = str(timestamp) b.put(rk, final) def get_next_requests(self, max_n_requests, partition_id, **kwargs): @@ -184,10 +198,10 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): :param max_requests_per_host: maximum number of requests per host :return: list of :class:`Request ` objects. """ - min_requests = kwargs.pop('min_requests') - min_hosts = kwargs.pop('min_hosts') - max_requests_per_host = kwargs.pop('max_requests_per_host') - assert(max_n_requests > min_requests) + min_requests = kwargs.pop("min_requests") + min_hosts = kwargs.pop("min_hosts") + max_requests_per_host = kwargs.pop("max_requests_per_host") + assert max_n_requests > min_requests table = self.connection.table(self.table_name) meta_map = {} @@ -195,20 +209,25 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): limit = min_requests tries = 0 count = 0 - prefix = '%d_' % partition_id + prefix = f"{partition_id}_" now_ts = int(time()) - filter = "PrefixFilter ('%s') AND SingleColumnValueFilter ('f', 't', <=, 'binary:%d')" % (prefix, now_ts) + filter = f"PrefixFilter ('{prefix}') AND SingleColumnValueFilter ('f', 't', <=, 'binary:{now_ts}')" while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 - self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d", - tries, limit, count, len(queue.keys())) + self.logger.debug( + "Try %d, limit %d, last attempt: requests %d, hosts %d", + tries, + limit, + count, + len(queue.keys()), + ) meta_map.clear() queue.clear() count = 0 for rk, data in table.scan(limit=int(limit), batch_size=256, filter=filter): - for cq, buf in six.iteritems(data): - if cq == b'f:t': + for cq, buf in data.items(): + if cq == b"f:t": continue stream = BytesIO(buf) unpacker = Unpacker(stream) @@ -216,7 +235,10 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): fprint, host_crc32, _, _ = item if host_crc32 not in queue: queue[host_crc32] = [] - if max_requests_per_host is not None and len(queue[host_crc32]) > max_requests_per_host: + if ( + max_requests_per_host is not None + and len(queue[host_crc32]) > max_requests_per_host + ): continue queue[host_crc32].append(fprint) count += 1 @@ -234,18 +256,20 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): continue break - self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count) + self.logger.debug( + "Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count + ) # 
For every fingerprint collect it's row keys and return all fingerprints from them fprint_map = {} - for fprint, meta_list in six.iteritems(meta_map): + for fprint, meta_list in meta_map.items(): for rk, _ in meta_list: fprint_map.setdefault(rk, []).append(fprint) results = [] trash_can = set() - for _, fprints in six.iteritems(queue): + for fprints in queue.values(): for fprint in fprints: for rk, _ in meta_map[fprint]: if rk in trash_can: @@ -254,7 +278,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): _, item = meta_map[rk_fprint][0] _, _, encoded, score = item request = self.decoder.decode_request(encoded) - request.meta[b'score'] = score + request.meta[b"score"] = score results.append(request) trash_can.add(rk) @@ -269,7 +293,6 @@ def count(self): class HBaseState(States): - def __init__(self, connection, table_name, cache_size_limit): self.connection = connection self._table_name = table_name @@ -281,15 +304,17 @@ def update_cache(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] def put(obj): - self._state_cache[obj.meta[b'fingerprint']] = obj.meta[b'state'] + self._state_cache[obj.meta[b"fingerprint"]] = obj.meta[b"state"] + [put(obj) for obj in objs] def set_states(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] def get(obj): - fprint = obj.meta[b'fingerprint'] - obj.meta[b'state'] = self._state_cache[fprint] if fprint in self._state_cache else States.DEFAULT + fprint = obj.meta[b"fingerprint"] + obj.meta[b"state"] = self._state_cache.get(fprint, States.DEFAULT) + [get(obj) for obj in objs] def flush(self, force_clear): @@ -302,25 +327,33 @@ def flush(self, force_clear): hb_obj = prepare_hbase_object(state=state) b.put(unhexlify(fprint), hb_obj) if force_clear: - self.logger.debug("Cache has %d requests, clearing" % len(self._state_cache)) + self.logger.debug(f"Cache has {len(self._state_cache)} requests, clearing") self._state_cache.clear() def fetch(self, fingerprints): to_fetch = [f for f in fingerprints if f not in self._state_cache] - self.logger.debug("cache size %s" % len(self._state_cache)) - self.logger.debug("to fetch %d from %d" % (len(to_fetch), len(fingerprints))) + self.logger.debug(f"cache size {len(self._state_cache)}") + self.logger.debug(f"to fetch {len(to_fetch)} from {len(fingerprints)}") for chunk in chunks(to_fetch, 65536): keys = [unhexlify(fprint) for fprint in chunk] table = self.connection.table(self._table_name) - records = table.rows(keys, columns=[b's:state']) + records = table.rows(keys, columns=[b"s:state"]) for key, cells in records: - if b's:state' in cells: - state = unpack('>B', cells[b's:state'])[0] + if b"s:state" in cells: + state = unpack(">B", cells[b"s:state"])[0] self._state_cache[hexlify(key)] = state class HBaseMetadata(Metadata): - def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_size, store_content): + def __init__( + self, + connection, + table_name, + drop_all_tables, + use_snappy, + batch_size, + store_content, + ): self._table_name = to_bytes(table_name) tables = set(connection.tables()) if drop_all_tables and self._table_name in tables: @@ -328,14 +361,19 @@ def __init__(self, connection, table_name, drop_all_tables, use_snappy, batch_si tables.remove(self._table_name) if self._table_name not in tables: - schema = {'m': {'max_versions': 1}, - 's': {'max_versions': 1, 'block_cache_enabled': 1, - 'bloom_filter_type': 'ROW', 'in_memory': True, }, - 'c': {'max_versions': 1} - } + schema = { + "m": {"max_versions": 1}, + "s": { + "max_versions": 1, 
+ "block_cache_enabled": 1, + "bloom_filter_type": "ROW", + "in_memory": True, + }, + "c": {"max_versions": 1}, + } if use_snappy: - schema['m']['compression'] = 'SNAPPY' - schema['c']['compression'] = 'SNAPPY' + schema["m"]["compression"] = "SNAPPY" + schema["c"]["compression"] = "SNAPPY" connection.create_table(self._table_name, schema) table = connection.table(self._table_name) self.batch = table.batch(batch_size=batch_size) @@ -352,71 +390,85 @@ def flush(self): def add_seeds(self, seeds): for seed in seeds: - obj = prepare_hbase_object(url=seed.url, - depth=0, - created_at=utcnow_timestamp(), - domain_fingerprint=seed.meta[b'domain'][b'fingerprint']) - self.batch.put(unhexlify(seed.meta[b'fingerprint']), obj) + obj = prepare_hbase_object( + url=seed.url, + depth=0, + created_at=utcnow_timestamp(), + domain_fingerprint=seed.meta[b"domain"][b"fingerprint"], + ) + self.batch.put(unhexlify(seed.meta[b"fingerprint"]), obj) def page_crawled(self, response): - obj = prepare_hbase_object(status_code=response.status_code, content=response.body) if self.store_content else \ - prepare_hbase_object(status_code=response.status_code) - self.batch.put(unhexlify(response.meta[b'fingerprint']), obj) + obj = ( + prepare_hbase_object( + status_code=response.status_code, content=response.body + ) + if self.store_content + else prepare_hbase_object(status_code=response.status_code) + ) + self.batch.put(unhexlify(response.meta[b"fingerprint"]), obj) def links_extracted(self, request, links): - links_dict = dict() + links_dict = {} for link in links: - links_dict[unhexlify(link.meta[b'fingerprint'])] = (link, link.url, link.meta[b'domain']) - for link_fingerprint, (link, link_url, link_domain) in six.iteritems(links_dict): - obj = prepare_hbase_object(url=link_url, - created_at=utcnow_timestamp(), - domain_fingerprint=link_domain[b'fingerprint']) + links_dict[unhexlify(link.meta[b"fingerprint"])] = ( + link, + link.url, + link.meta[b"domain"], + ) + for link_fingerprint, (_, link_url, link_domain) in (links_dict).items(): + obj = prepare_hbase_object( + url=link_url, + created_at=utcnow_timestamp(), + domain_fingerprint=link_domain[b"fingerprint"], + ) self.batch.put(link_fingerprint, obj) def request_error(self, request, error): - obj = prepare_hbase_object(url=request.url, - created_at=utcnow_timestamp(), - error=error, - domain_fingerprint=request.meta[b'domain'][b'fingerprint']) - rk = unhexlify(request.meta[b'fingerprint']) + obj = prepare_hbase_object( + url=request.url, + created_at=utcnow_timestamp(), + error=error, + domain_fingerprint=request.meta[b"domain"][b"fingerprint"], + ) + rk = unhexlify(request.meta[b"fingerprint"]) self.batch.put(rk, obj) def update_score(self, batch): if not isinstance(batch, dict): - raise TypeError('batch should be dict with fingerprint as key, and float score as value') - for fprint, (score, url, schedule) in six.iteritems(batch): + raise TypeError( + "batch should be dict with fingerprint as key, and float score as value" + ) + for fprint, (score, _url, _schedule) in batch.items(): obj = prepare_hbase_object(score=score) rk = unhexlify(fprint) self.batch.put(rk, obj) class HBaseBackend(DistributedBackend): - component_name = 'HBase Backend' + component_name = "HBase Backend" def __init__(self, manager): self.manager = manager self.logger = logging.getLogger("hbase.backend") settings = manager.settings - port = settings.get('HBASE_THRIFT_PORT') - hosts = settings.get('HBASE_THRIFT_HOST') - namespace = settings.get('HBASE_NAMESPACE') - self._min_requests = 
settings.get('BC_MIN_REQUESTS') - self._min_hosts = settings.get('BC_MIN_HOSTS') - self._max_requests_per_host = settings.get('BC_MAX_REQUESTS_PER_HOST') - - self.queue_partitions = settings.get('SPIDER_FEED_PARTITIONS') - host = choice(hosts) if type(hosts) in [list, tuple] else hosts + port = settings.get("HBASE_THRIFT_PORT") + hosts = settings.get("HBASE_THRIFT_HOST") + namespace = settings.get("HBASE_NAMESPACE") + self._min_requests = settings.get("BC_MIN_REQUESTS") + self._min_hosts = settings.get("BC_MIN_HOSTS") + self._max_requests_per_host = settings.get("BC_MAX_REQUESTS_PER_HOST") + + self.queue_partitions = settings.get("SPIDER_FEED_PARTITIONS") + host = choice(hosts) if type(hosts) in [list, tuple] else hosts # noqa: S311 kwargs = { - 'host': host, - 'port': int(port), - 'table_prefix': namespace, - 'table_prefix_separator': ':' + "host": host, + "port": int(port), + "table_prefix": namespace, + "table_prefix_separator": ":", } - if settings.get('HBASE_USE_FRAMED_COMPACT'): - kwargs.update({ - 'protocol': 'compact', - 'transport': 'framed' - }) + if settings.get("HBASE_USE_FRAMED_COMPACT"): + kwargs.update({"protocol": "compact", "transport": "framed"}) self.connection = Connection(**kwargs) self._metadata = None self._queue = None @@ -426,20 +478,32 @@ def __init__(self, manager): def strategy_worker(cls, manager): o = cls(manager) settings = manager.settings - o._states = HBaseState(o.connection, settings.get('HBASE_METADATA_TABLE'), - settings.get('HBASE_STATE_CACHE_SIZE_LIMIT')) + o._states = HBaseState( + o.connection, + settings.get("HBASE_METADATA_TABLE"), + settings.get("HBASE_STATE_CACHE_SIZE_LIMIT"), + ) return o @classmethod def db_worker(cls, manager): o = cls(manager) settings = manager.settings - drop_all_tables = settings.get('HBASE_DROP_ALL_TABLES') - o._queue = HBaseQueue(o.connection, o.queue_partitions, - settings.get('HBASE_QUEUE_TABLE'), drop=drop_all_tables) - o._metadata = HBaseMetadata(o.connection, settings.get('HBASE_METADATA_TABLE'), drop_all_tables, - settings.get('HBASE_USE_SNAPPY'), settings.get('HBASE_BATCH_SIZE'), - settings.get('STORE_CONTENT')) + drop_all_tables = settings.get("HBASE_DROP_ALL_TABLES") + o._queue = HBaseQueue( + o.connection, + o.queue_partitions, + settings.get("HBASE_QUEUE_TABLE"), + drop=drop_all_tables, + ) + o._metadata = HBaseMetadata( + o.connection, + settings.get("HBASE_METADATA_TABLE"), + drop_all_tables, + settings.get("HBASE_USE_SNAPPY"), + settings.get("HBASE_BATCH_SIZE"), + settings.get("STORE_CONTENT"), + ) return o @property @@ -483,14 +547,19 @@ def finished(self): def get_next_requests(self, max_next_requests, **kwargs): next_pages = [] self.logger.debug("Querying queue table.") - partitions = set(kwargs.pop('partitions', [])) - for partition_id in range(0, self.queue_partitions): + partitions = set(kwargs.pop("partitions", [])) + for partition_id in range(self.queue_partitions): if partition_id not in partitions: continue - results = self.queue.get_next_requests(max_next_requests, partition_id, - min_requests=self._min_requests, - min_hosts=self._min_hosts, - max_requests_per_host=self._max_requests_per_host) + results = self.queue.get_next_requests( + max_next_requests, + partition_id, + min_requests=self._min_requests, + min_hosts=self._min_hosts, + max_requests_per_host=self._max_requests_per_host, + ) next_pages.extend(results) - self.logger.debug("Got %d requests for partition id %d", len(results), partition_id) + self.logger.debug( + "Got %d requests for partition id %d", len(results), partition_id + ) 
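HBaseBackend reads every connection, batching and queue-sizing knob from settings. A minimal sketch of a db-worker settings module using the setting names consumed above (the hosts and numeric values are illustrative assumptions, not recommended defaults):

# HBase connection (one host is picked at random when a list is given).
HBASE_THRIFT_HOST = ["hbase-thrift-1.local", "hbase-thrift-2.local"]
HBASE_THRIFT_PORT = 9090
HBASE_NAMESPACE = "crawler"
HBASE_USE_FRAMED_COMPACT = False

# Tables and write batching.
HBASE_METADATA_TABLE = "metadata"
HBASE_QUEUE_TABLE = "queue"
HBASE_DROP_ALL_TABLES = False
HBASE_USE_SNAPPY = False
HBASE_BATCH_SIZE = 9216
HBASE_STATE_CACHE_SIZE_LIMIT = 3_000_000
STORE_CONTENT = False

# Sizing used by HBaseQueue.get_next_requests().
SPIDER_FEED_PARTITIONS = 2
BC_MIN_REQUESTS = 64
BC_MIN_HOSTS = 24
BC_MAX_REQUESTS_PER_HOST = 128
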
return next_pages diff --git a/frontera/contrib/backends/memory/__init__.py b/frontera/contrib/backends/memory/__init__.py index e96cd29a8..3ab4ff465 100644 --- a/frontera/contrib/backends/memory/__init__.py +++ b/frontera/contrib/backends/memory/__init__.py @@ -1,17 +1,14 @@ -from __future__ import absolute_import import logging import random -from collections import deque, Iterable +from collections import deque +from collections.abc import Iterable from frontera.contrib.backends import CommonBackend -from frontera.core.components import Metadata, Queue, States +from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.core import OverusedBuffer +from frontera.core.components import Metadata, Queue, States from frontera.utils.heap import Heap -from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.utils.url import parse_domain_from_url_fast -import six -from six.moves import map -from six.moves import range def cmp(a, b): @@ -23,7 +20,7 @@ def __init__(self): self.requests = {} def request_error(self, request, error): - request.meta[b'error'] = error + request.meta[b"error"] = error self._get_or_create_request(request) def page_crawled(self, response): @@ -38,14 +35,13 @@ def add_seeds(self, seeds): self._get_or_create_request(seed) def _get_or_create_request(self, request): - fingerprint = request.meta[b'fingerprint'] + fingerprint = request.meta[b"fingerprint"] if fingerprint not in self.requests: new_request = request.copy() self.requests[fingerprint] = new_request return new_request, True - else: - page = self.requests[fingerprint] - return page, False + page = self.requests[fingerprint] + return page, False def update_score(self, batch): pass @@ -53,7 +49,7 @@ def update_score(self, batch): class MemoryQueue(Queue): def __init__(self, partitions): - self.partitions = [i for i in range(0, partitions)] + self.partitions = list(range(partitions)) self.partitioner = Crc32NamePartitioner(self.partitions) self.logger = logging.getLogger("memory.queue") self.heap = {} @@ -61,7 +57,7 @@ def __init__(self, partitions): self.heap[partition] = Heap(self._compare_pages) def count(self): - return sum([len(h.heap) for h in six.itervalues(self.heap)]) + return sum([len(h.heap) for h in self.heap.values()]) def get_next_requests(self, max_n_requests, partition_id, **kwargs): return self.heap[partition_id].pop(max_n_requests) @@ -69,17 +65,21 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): def schedule(self, batch): for fprint, score, request, schedule in batch: if schedule: - request.meta[b'_scr'] = score + request.meta[b"_scr"] = score _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) + self.logger.error( + "Can't get hostname for URL %s, fingerprint %s", + request.url, + fprint, + ) partition_id = self.partitions[0] else: partition_id = self.partitioner.partition(hostname, self.partitions) self.heap[partition_id].push(request) def _compare_pages(self, first, second): - return cmp(first.meta[b'_scr'], second.meta[b'_scr']) + return cmp(first.meta[b"_scr"], second.meta[b"_scr"]) class MemoryDequeQueue(Queue): @@ -89,7 +89,7 @@ def __init__(self, partitions, is_fifo=True): :param partitions: int count of partitions :param type: bool, True for FIFO, False for LIFO """ - self.partitions = [i for i in range(0, partitions)] + self.partitions = list(range(partitions)) self.partitioner = 
Crc32NamePartitioner(self.partitions) self.logger = logging.getLogger("memory.dequequeue") self.queues = {} @@ -98,11 +98,15 @@ def __init__(self, partitions, is_fifo=True): self.queues[partition] = deque() def count(self): - return sum([len(h) for h in six.itervalues(self.queues)]) + return sum([len(h) for h in self.queues.values()]) def get_next_requests(self, max_n_requests, partition_id, **kwargs): batch = [] - pop_op = self.queues[partition_id].popleft if self.is_fifo else self.queues[partition_id].pop + pop_op = ( + self.queues[partition_id].popleft + if self.is_fifo + else self.queues[partition_id].pop + ) while max_n_requests > 0 and self.queues[partition_id]: batch.append(pop_op()) max_n_requests -= 1 @@ -111,10 +115,14 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): def schedule(self, batch): for fprint, score, request, schedule in batch: if schedule: - request.meta[b'_scr'] = score + request.meta[b"_scr"] = score _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s", request.url, fprint) + self.logger.error( + "Can't get hostname for URL %s, fingerprint %s", + request.url, + fprint, + ) partition_id = self.partitions[0] else: partition_id = self.partitioner.partition(hostname, self.partitions) @@ -122,18 +130,17 @@ def schedule(self, batch): class MemoryStates(States): - def __init__(self, cache_size_limit): - self._cache = dict() + self._cache = {} self._cache_size_limit = cache_size_limit self.logger = logging.getLogger("memory.states") def _put(self, obj): - self._cache[obj.meta[b'fingerprint']] = obj.meta[b'state'] + self._cache[obj.meta[b"fingerprint"]] = obj.meta[b"state"] def _get(self, obj): - fprint = obj.meta[b'fingerprint'] - obj.meta[b'state'] = self._cache[fprint] if fprint in self._cache else States.DEFAULT + fprint = obj.meta[b"fingerprint"] + obj.meta[b"state"] = self._cache.get(fprint, States.DEFAULT) def update_cache(self, objs): objs = objs if isinstance(objs, Iterable) else [objs] @@ -158,7 +165,8 @@ class MemoryBaseBackend(CommonBackend): """ Base class for in-memory heapq Backend objects. 
""" - component_name = 'Memory Base Backend' + + component_name = "Memory Base Backend" def __init__(self, manager): self.manager = manager @@ -189,15 +197,15 @@ def _create_queue(self, settings): def add_seeds(self, seeds): for seed in seeds: - seed.meta[b'id'] = self._id + seed.meta[b"id"] = self._id self._id += 1 - super(MemoryBaseBackend, self).add_seeds(seeds) + super().add_seeds(seeds) def links_extracted(self, request, links): for link in links: - link.meta[b'id'] = self._id + link.meta[b"id"] = self._id self._id += 1 - super(MemoryBaseBackend, self).links_extracted(request, links) + super().links_extracted(request, links) def finished(self): return self.queue.count() == 0 @@ -205,50 +213,54 @@ def finished(self): class MemoryDFSQueue(MemoryQueue): def _compare_pages(self, first, second): - return cmp((second.meta[b'depth'], first.meta[b'id']), - (first.meta[b'depth'], second.meta[b'id'])) + return cmp( + (second.meta[b"depth"], first.meta[b"id"]), + (first.meta[b"depth"], second.meta[b"id"]), + ) class MemoryBFSQueue(MemoryQueue): def _compare_pages(self, first, second): - return cmp((first.meta[b'depth'], first.meta[b'id']), - (second.meta[b'depth'], second.meta[b'id'])) + return cmp( + (first.meta[b"depth"], first.meta[b"id"]), + (second.meta[b"depth"], second.meta[b"id"]), + ) class MemoryRandomQueue(MemoryQueue): def _compare_pages(self, first, second): - return random.choice([-1, 0, 1]) + return random.choice([-1, 0, 1]) # noqa: S311 class MemoryFIFOBackend(MemoryBaseBackend): def _create_queue(self, settings): - return MemoryDequeQueue(settings.get('SPIDER_FEED_PARTITIONS')) + return MemoryDequeQueue(settings.get("SPIDER_FEED_PARTITIONS")) class MemoryLIFOBackend(MemoryBaseBackend): def _create_queue(self, settings): - return MemoryDequeQueue(settings.get('SPIDER_FEED_PARTITIONS'), is_fifo=False) + return MemoryDequeQueue(settings.get("SPIDER_FEED_PARTITIONS"), is_fifo=False) class MemoryDFSBackend(MemoryBaseBackend): def _create_queue(self, settings): - return MemoryDFSQueue(settings.get('SPIDER_FEED_PARTITIONS')) + return MemoryDFSQueue(settings.get("SPIDER_FEED_PARTITIONS")) class MemoryBFSBackend(MemoryBaseBackend): def _create_queue(self, settings): - return MemoryBFSQueue(settings.get('SPIDER_FEED_PARTITIONS')) + return MemoryBFSQueue(settings.get("SPIDER_FEED_PARTITIONS")) class MemoryRandomBackend(MemoryBaseBackend): def _create_queue(self, settings): - return MemoryRandomQueue(settings.get('SPIDER_FEED_PARTITIONS')) + return MemoryRandomQueue(settings.get("SPIDER_FEED_PARTITIONS")) class MemoryDFSOverusedBackend(MemoryDFSBackend): def __init__(self, manager): - super(MemoryDFSOverusedBackend, self).__init__(manager) - self.overused_buffer = OverusedBuffer(super(MemoryDFSOverusedBackend, self).get_next_requests) + super().__init__(manager) + self.overused_buffer = OverusedBuffer(super().get_next_requests) def get_next_requests(self, max_next_requests, **kwargs): return self.overused_buffer.get_next_requests(max_next_requests, **kwargs) diff --git a/frontera/contrib/backends/partitioners.py b/frontera/contrib/backends/partitioners.py index 5b425c20e..db8b66dc9 100644 --- a/frontera/contrib/backends/partitioners.py +++ b/frontera/contrib/backends/partitioners.py @@ -1,7 +1,5 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from struct import unpack from binascii import unhexlify +from struct import unpack from frontera.core.components import Partitioner from frontera.utils.misc import get_crc32 @@ -12,7 +10,9 @@ def partition(self, key, 
partitions=None): if key is None: return self.partitions[0] value = get_crc32(key) - return self.partition_by_hash(value, partitions if partitions else self.partitions) + return self.partition_by_hash( + value, partitions if partitions else self.partitions + ) def partition_by_hash(self, value, partitions): size = len(partitions) @@ -33,4 +33,4 @@ def partition(self, key, partitions=None): return partitions[idx] def __call__(self, key, all_partitions, available): - return self.partition(key, all_partitions) \ No newline at end of file + return self.partition(key, all_partitions) diff --git a/frontera/contrib/backends/remote/__init__.py b/frontera/contrib/backends/remote/__init__.py index 7c68785e9..e69de29bb 100644 --- a/frontera/contrib/backends/remote/__init__.py +++ b/frontera/contrib/backends/remote/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/frontera/contrib/backends/remote/codecs/__init__.py b/frontera/contrib/backends/remote/codecs/__init__.py index 7c68785e9..e69de29bb 100644 --- a/frontera/contrib/backends/remote/codecs/__init__.py +++ b/frontera/contrib/backends/remote/codecs/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/frontera/contrib/backends/remote/codecs/json.py b/frontera/contrib/backends/remote/codecs/json.py index ef4aa538d..668b1e35a 100644 --- a/frontera/contrib/backends/remote/codecs/json.py +++ b/frontera/contrib/backends/remote/codecs/json.py @@ -1,12 +1,11 @@ -# -*- coding: utf-8 -*- -""" A JSON codec for Frontera. Implemented using native json library. -""" -from __future__ import absolute_import +"""A JSON codec for Frontera. Implemented using native json library.""" + import json -import six from base64 import b64decode, b64encode + +from w3lib.util import to_bytes, to_unicode + from frontera.core.codec import BaseDecoder, BaseEncoder -from w3lib.util import to_unicode, to_bytes def _convert_and_save_type(obj): @@ -25,12 +24,15 @@ def _convert_and_save_type(obj): for the detailed explanation about the design. 
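The design note above boils down to this: JSON cannot represent bytes keys or tell tuples from lists, so the codec tags each value with its original type before encoding and untags it after decoding. A small worked example with the module-private helpers defined here (the meta dict is made up for illustration):

import json

from frontera.contrib.backends.remote.codecs.json import (
    _convert_and_save_type,
    _convert_from_saved_type,
)

meta = {b"fingerprint": b"ab12", "depth": 0, "tags": ("seed",)}
tagged = _convert_and_save_type(meta)
# ('dict', [(('bytes', 'fingerprint'), ('bytes', 'ab12')),
#           (('other', 'depth'), ('other', 0)),
#           (('other', 'tags'), ('tuple', [('other', 'seed')]))])

# The tagged form is plain JSON data; the tags let the decoder rebuild
# bytes keys and the tuple after a dumps/loads round trip.
restored = _convert_from_saved_type(json.loads(json.dumps(tagged)))
assert restored == meta
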
""" if isinstance(obj, bytes): - return 'bytes', to_unicode(obj) - elif isinstance(obj, dict): - return 'dict', [(_convert_and_save_type(k), _convert_and_save_type(v)) for k, v in six.iteritems(obj)] - elif isinstance(obj, (list, tuple)): + return "bytes", to_unicode(obj) + if isinstance(obj, dict): + return "dict", [ + (_convert_and_save_type(k), _convert_and_save_type(v)) + for k, v in obj.items() + ] + if isinstance(obj, (list, tuple)): return type(obj).__name__, [_convert_and_save_type(item) for item in obj] - return 'other', obj + return "other", obj def _convert_from_saved_type(obj): @@ -45,22 +47,27 @@ def _convert_from_saved_type(obj): """ assert len(obj) == 2 obj_type, obj_value = obj - if obj_type == 'bytes': + if obj_type == "bytes": return to_bytes(obj_value) - elif obj_type == 'dict': - return dict([(_convert_from_saved_type(k), _convert_from_saved_type(v)) for k, v in obj_value]) - elif obj_type in ['list', 'tuple']: - _type = list if obj_type == 'list' else tuple + if obj_type == "dict": + return { + _convert_from_saved_type(k): _convert_from_saved_type(v) + for k, v in obj_value + } + if obj_type in ["list", "tuple"]: + _type = list if obj_type == "list" else tuple return _type([_convert_from_saved_type(item) for item in obj_value]) return obj_value def _prepare_request_message(request): - return {'url': request.url, - 'method': request.method, - 'headers': request.headers, - 'cookies': request.cookies, - 'meta': request.meta} + return { + "url": request.url, + "method": request.method, + "headers": request.headers, + "cookies": request.cookies, + "meta": request.meta, + } def _prepare_links_message(links): @@ -68,134 +75,152 @@ def _prepare_links_message(links): def _prepare_response_message(response, send_body): - return {'url': response.url, - 'status_code': response.status_code, - 'meta': response.meta, - 'body': b64encode(response.body) if send_body else None} + return { + "url": response.url, + "status_code": response.status_code, + "meta": response.meta, + "body": b64encode(response.body) if send_body else None, + } class CrawlFrontierJSONEncoder(json.JSONEncoder): def __init__(self, request_model, *a, **kw): self._request_model = request_model - super(CrawlFrontierJSONEncoder, self).__init__(*a, **kw) + super().__init__(*a, **kw) def default(self, o): if isinstance(o, self._request_model): return _prepare_request_message(o) - else: - return super(CrawlFrontierJSONEncoder, self).default(o) + return super().default(o) class Encoder(BaseEncoder, CrawlFrontierJSONEncoder): def __init__(self, request_model, *a, **kw): - self.send_body = kw.pop('send_body', False) - super(Encoder, self).__init__(request_model, *a, **kw) + self.send_body = kw.pop("send_body", False) + super().__init__(request_model, *a, **kw) def encode(self, obj): encoded = _convert_and_save_type(obj) - return super(Encoder, self).encode(encoded) + return super().encode(encoded) def encode_add_seeds(self, seeds): - return self.encode({ - 'type': 'add_seeds', - 'seeds': [_prepare_request_message(seed) for seed in seeds] - }) + return self.encode( + { + "type": "add_seeds", + "seeds": [_prepare_request_message(seed) for seed in seeds], + } + ) def encode_page_crawled(self, response): - return self.encode({ - 'type': 'page_crawled', - 'r': _prepare_response_message(response, self.send_body) - }) + return self.encode( + { + "type": "page_crawled", + "r": _prepare_response_message(response, self.send_body), + } + ) def encode_links_extracted(self, request, links): - return self.encode({ - 'type': 
'links_extracted', - 'r': _prepare_request_message(request), - 'links': _prepare_links_message(links) - }) + return self.encode( + { + "type": "links_extracted", + "r": _prepare_request_message(request), + "links": _prepare_links_message(links), + } + ) def encode_request_error(self, request, error): - return self.encode({ - 'type': 'request_error', - 'r': _prepare_request_message(request), - 'error': error - }) + return self.encode( + { + "type": "request_error", + "r": _prepare_request_message(request), + "error": error, + } + ) def encode_request(self, request): return self.encode(_prepare_request_message(request)) def encode_update_score(self, request, score, schedule): - return self.encode({'type': 'update_score', - 'r': _prepare_request_message(request), - 'score': score, - 'schedule': schedule}) + return self.encode( + { + "type": "update_score", + "r": _prepare_request_message(request), + "score": score, + "schedule": schedule, + } + ) def encode_new_job_id(self, job_id): - return self.encode({ - 'type': 'new_job_id', - 'job_id': int(job_id) - }) + return self.encode({"type": "new_job_id", "job_id": int(job_id)}) def encode_offset(self, partition_id, offset): - return self.encode({ - 'type': 'offset', - 'partition_id': int(partition_id), - 'offset': int(offset) - }) + return self.encode( + {"type": "offset", "partition_id": int(partition_id), "offset": int(offset)} + ) class Decoder(json.JSONDecoder, BaseDecoder): def __init__(self, request_model, response_model, *a, **kw): self._request_model = request_model self._response_model = response_model - super(Decoder, self).__init__(*a, **kw) + super().__init__(*a, **kw) def _response_from_object(self, obj): - url = obj['url'] - request = self._request_model(url=url, - meta=obj['meta']) - return self._response_model(url=url, - status_code=obj['status_code'], - body=b64decode(obj['body']) if obj['body'] is not None else None, - request=request) + url = obj["url"] + request = self._request_model(url=url, meta=obj["meta"]) + return self._response_model( + url=url, + status_code=obj["status_code"], + body=b64decode(obj["body"]) if obj["body"] is not None else None, + request=request, + ) def _request_from_object(self, obj): - return self._request_model(url=obj['url'], - method=obj['method'], - headers=obj['headers'], - cookies=obj['cookies'], - meta=obj['meta']) + return self._request_model( + url=obj["url"], + method=obj["method"], + headers=obj["headers"], + cookies=obj["cookies"], + meta=obj["meta"], + ) def decode(self, message): - message = _convert_from_saved_type(super(Decoder, self).decode(message)) - if message['type'] == 'links_extracted': - request = self._request_from_object(message['r']) - links = [self._request_from_object(link) for link in message['links']] - return ('links_extracted', request, links) - if message['type'] == 'page_crawled': - response = self._response_from_object(message['r']) - return ('page_crawled', response) - if message['type'] == 'request_error': - request = self._request_from_object(message['r']) - return ('request_error', request, message['error']) - if message['type'] == 'update_score': - return ('update_score', self._request_from_object(message['r']), message['score'], message['schedule']) - if message['type'] == 'add_seeds': + message = _convert_from_saved_type(super().decode(message)) + if message["type"] == "links_extracted": + request = self._request_from_object(message["r"]) + links = [self._request_from_object(link) for link in message["links"]] + return ("links_extracted", request, links) + 
if message["type"] == "page_crawled": + response = self._response_from_object(message["r"]) + return ("page_crawled", response) + if message["type"] == "request_error": + request = self._request_from_object(message["r"]) + return ("request_error", request, message["error"]) + if message["type"] == "update_score": + return ( + "update_score", + self._request_from_object(message["r"]), + message["score"], + message["schedule"], + ) + if message["type"] == "add_seeds": seeds = [] - for seed in message['seeds']: + for seed in message["seeds"]: request = self._request_from_object(seed) seeds.append(request) - return ('add_seeds', seeds) - if message['type'] == 'new_job_id': - return ('new_job_id', int(message['job_id'])) - if message['type'] == 'offset': - return ('offset', int(message['partition_id']), int(message['offset'])) - return TypeError('Unknown message type') + return ("add_seeds", seeds) + if message["type"] == "new_job_id": + return ("new_job_id", int(message["job_id"])) + if message["type"] == "offset": + return ("offset", int(message["partition_id"]), int(message["offset"])) + return TypeError("Unknown message type") def decode_request(self, message): - obj = _convert_from_saved_type(super(Decoder, self).decode(message)) - return self._request_model(url=obj['url'], - method=obj['method'], - headers=obj['headers'], - cookies=obj['cookies'], - meta=obj['meta']) + obj = _convert_from_saved_type(super().decode(message)) + return self._request_model( + url=obj["url"], + method=obj["method"], + headers=obj["headers"], + cookies=obj["cookies"], + meta=obj["meta"], + ) diff --git a/frontera/contrib/backends/remote/codecs/msgpack.py b/frontera/contrib/backends/remote/codecs/msgpack.py index 6be589dae..d35a76da8 100644 --- a/frontera/contrib/backends/remote/codecs/msgpack.py +++ b/frontera/contrib/backends/remote/codecs/msgpack.py @@ -1,14 +1,11 @@ -# -*- coding: utf-8 -*- -""" A MsgPack codec for Frontera. Implemented using native msgpack-python library. -""" -from __future__ import absolute_import +"""A MsgPack codec for Frontera. 
Implemented using native msgpack-python library.""" + import logging + from msgpack import packb, unpackb +from w3lib.util import to_unicode from frontera.core.codec import BaseDecoder, BaseEncoder -import six -from w3lib.util import to_native_str - logger = logging.getLogger(__name__) @@ -16,57 +13,85 @@ def _prepare_request_message(request): def serialize(obj): """Recursively walk object's hierarchy.""" - if isinstance(obj, (bool, six.integer_types, float, six.binary_type, six.text_type)): + if isinstance(obj, (bool, (int,), float, bytes, str)): return obj - elif isinstance(obj, dict): + if isinstance(obj, dict): obj = obj.copy() for key in obj: obj[key] = serialize(obj[key]) return obj - elif isinstance(obj, list): + if isinstance(obj, list): return [serialize(item) for item in obj] - elif isinstance(obj, tuple): - return tuple(serialize([item for item in obj])) - elif hasattr(obj, '__dict__'): + if isinstance(obj, tuple): + return tuple(serialize(list(obj))) + if hasattr(obj, "__dict__"): return serialize(obj.__dict__) - else: - logger.warning('unable to serialize object: {}'.format(obj)) - return None - return [request.url, request.method, request.headers, request.cookies, serialize(request.meta)] + logger.warning(f"unable to serialize object: {obj}") + return None + + return [ + request.url, + request.method, + request.headers, + request.cookies, + serialize(request.meta), + ] def _prepare_response_message(response, send_body): - return [response.url, response.status_code, response.meta, response.body if send_body else None] + return [ + response.url, + response.status_code, + response.meta, + response.body if send_body else None, + ] class Encoder(BaseEncoder): def __init__(self, request_model, *a, **kw): - self.send_body = True if 'send_body' in kw and kw['send_body'] else False + self.send_body = bool(kw.get("send_body")) def encode_add_seeds(self, seeds): - return packb([b'as', [_prepare_request_message(seed) for seed in seeds]], use_bin_type=True) + return packb( + [b"as", [_prepare_request_message(seed) for seed in seeds]], + use_bin_type=True, + ) def encode_page_crawled(self, response): - return packb([b'pc', _prepare_response_message(response, self.send_body)], use_bin_type=True) + return packb( + [b"pc", _prepare_response_message(response, self.send_body)], + use_bin_type=True, + ) def encode_links_extracted(self, request, links): - return packb([b'le', _prepare_request_message(request), [_prepare_request_message(link) for link in links]], - use_bin_type=True) + return packb( + [ + b"le", + _prepare_request_message(request), + [_prepare_request_message(link) for link in links], + ], + use_bin_type=True, + ) def encode_request_error(self, request, error): - return packb([b're', _prepare_request_message(request), str(error)], use_bin_type=True) + return packb( + [b"re", _prepare_request_message(request), str(error)], use_bin_type=True + ) def encode_request(self, request): return packb(_prepare_request_message(request), use_bin_type=True) def encode_update_score(self, request, score, schedule): - return packb([b'us', _prepare_request_message(request), score, schedule], use_bin_type=True) + return packb( + [b"us", _prepare_request_message(request), score, schedule], + use_bin_type=True, + ) def encode_new_job_id(self, job_id): - return packb([b'njid', int(job_id)], use_bin_type=True) + return packb([b"njid", int(job_id)], use_bin_type=True) def encode_offset(self, partition_id, offset): - return packb([b'of', int(partition_id), int(offset)], use_bin_type=True) + return 
packb([b"of", int(partition_id), int(offset)], use_bin_type=True) class Decoder(BaseDecoder): @@ -75,40 +100,48 @@ def __init__(self, request_model, response_model, *a, **kw): self._response_model = response_model def _response_from_object(self, obj): - url = to_native_str(obj[0]) - return self._response_model(url=url, - status_code=obj[1], - body=obj[3], - request=self._request_model(url=url, - meta=obj[2])) + url = to_unicode(obj[0]) + return self._response_model( + url=url, + status_code=obj[1], + body=obj[3], + request=self._request_model(url=url, meta=obj[2]), + ) def _request_from_object(self, obj): - return self._request_model(url=to_native_str(obj[0]), - method=obj[1], - headers=obj[2], - cookies=obj[3], - meta=obj[4]) + return self._request_model( + url=to_unicode(obj[0]), + method=obj[1], + headers=obj[2], + cookies=obj[3], + meta=obj[4], + ) def decode(self, buffer): - obj = unpackb(buffer, encoding='utf-8') - if obj[0] == b'pc': - return ('page_crawled', - self._response_from_object(obj[1])) - if obj[0] == b'le': - return ('links_extracted', - self._request_from_object(obj[1]), - [self._request_from_object(x) for x in obj[2]]) - if obj[0] == b'us': - return ('update_score', self._request_from_object(obj[1]), obj[2], obj[3]) - if obj[0] == b're': - return ('request_error', self._request_from_object(obj[1]), to_native_str(obj[2])) - if obj[0] == b'as': - return ('add_seeds', [self._request_from_object(x) for x in obj[1]]) - if obj[0] == b'njid': - return ('new_job_id', int(obj[1])) - if obj[0] == b'of': - return ('offset', int(obj[1]), int(obj[2])) - return TypeError('Unknown message type') + obj = unpackb(buffer, encoding="utf-8") + if obj[0] == b"pc": + return ("page_crawled", self._response_from_object(obj[1])) + if obj[0] == b"le": + return ( + "links_extracted", + self._request_from_object(obj[1]), + [self._request_from_object(x) for x in obj[2]], + ) + if obj[0] == b"us": + return ("update_score", self._request_from_object(obj[1]), obj[2], obj[3]) + if obj[0] == b"re": + return ( + "request_error", + self._request_from_object(obj[1]), + to_unicode(obj[2]), + ) + if obj[0] == b"as": + return ("add_seeds", [self._request_from_object(x) for x in obj[1]]) + if obj[0] == b"njid": + return ("new_job_id", int(obj[1])) + if obj[0] == b"of": + return ("offset", int(obj[1]), int(obj[2])) + return TypeError("Unknown message type") def decode_request(self, buffer): - return self._request_from_object(unpackb(buffer, encoding='utf-8')) + return self._request_from_object(unpackb(buffer, encoding="utf-8")) diff --git a/frontera/contrib/backends/remote/messagebus.py b/frontera/contrib/backends/remote/messagebus.py index f3827c22a..16c470401 100644 --- a/frontera/contrib/backends/remote/messagebus.py +++ b/frontera/contrib/backends/remote/messagebus.py @@ -1,33 +1,34 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import +import logging + from frontera import Backend from frontera.core import OverusedBuffer from frontera.utils.misc import load_object -import logging -import six class MessageBusBackend(Backend): def __init__(self, manager): settings = manager.settings - messagebus = load_object(settings.get('MESSAGE_BUS')) + messagebus = load_object(settings.get("MESSAGE_BUS")) self.mb = messagebus(settings) - codec_path = settings.get('MESSAGE_BUS_CODEC') - encoder_cls = load_object(codec_path+".Encoder") - decoder_cls = load_object(codec_path+".Decoder") - store_content = settings.get('STORE_CONTENT') + codec_path = settings.get("MESSAGE_BUS_CODEC") + encoder_cls = 
load_object(codec_path + ".Encoder") + decoder_cls = load_object(codec_path + ".Decoder") + store_content = settings.get("STORE_CONTENT") self._encoder = encoder_cls(manager.request_model, send_body=store_content) self._decoder = decoder_cls(manager.request_model, manager.response_model) self.spider_log_producer = self.mb.spider_log().producer() spider_feed = self.mb.spider_feed() - self.partition_id = int(settings.get('SPIDER_PARTITION_ID')) - if self.partition_id < 0 or self.partition_id >= settings.get('SPIDER_FEED_PARTITIONS'): - raise ValueError("Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS.") + self.partition_id = int(settings.get("SPIDER_PARTITION_ID")) + if self.partition_id < 0 or self.partition_id >= settings.get( + "SPIDER_FEED_PARTITIONS" + ): + raise ValueError( + "Spider partition id cannot be less than 0 or more than SPIDER_FEED_PARTITIONS." + ) self.consumer = spider_feed.consumer(partition_id=self.partition_id) - self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT')) + self._get_timeout = float(settings.get("KAFKA_GET_TIMEOUT")) self._logger = logging.getLogger("messagebus-backend") - self._buffer = OverusedBuffer(self._get_next_requests, - self._logger.debug) + self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug) self._logger.info("Consuming from partition id %d", self.partition_id) @classmethod @@ -42,36 +43,49 @@ def frontier_stop(self): def add_seeds(self, seeds): per_host = aggregate_per_host(seeds) - for host_fprint, host_links in six.iteritems(per_host): - self.spider_log_producer.send(host_fprint, - self._encoder.encode_add_seeds(host_links)) + for host_fprint, host_links in per_host.items(): + self.spider_log_producer.send( + host_fprint, self._encoder.encode_add_seeds(host_links) + ) def page_crawled(self, response): host_fprint = get_host_fprint(response) - self.spider_log_producer.send(host_fprint, self._encoder.encode_page_crawled(response)) + self.spider_log_producer.send( + host_fprint, self._encoder.encode_page_crawled(response) + ) def links_extracted(self, request, links): per_host = aggregate_per_host(links) - for host_fprint, host_links in six.iteritems(per_host): - self.spider_log_producer.send(host_fprint, - self._encoder.encode_links_extracted(request, host_links)) + for host_fprint, host_links in per_host.items(): + self.spider_log_producer.send( + host_fprint, self._encoder.encode_links_extracted(request, host_links) + ) def request_error(self, page, error): host_fprint = get_host_fprint(page) - self.spider_log_producer.send(host_fprint, self._encoder.encode_request_error(page, error)) + self.spider_log_producer.send( + host_fprint, self._encoder.encode_request_error(page, error) + ) def _get_next_requests(self, max_n_requests, **kwargs): requests = [] - for encoded in self.consumer.get_messages(count=max_n_requests, timeout=self._get_timeout): + for encoded in self.consumer.get_messages( + count=max_n_requests, timeout=self._get_timeout + ): try: request = self._decoder.decode_request(encoded) - except Exception as exc: - self._logger.warning("Could not decode message: {0}, error {1}".format(encoded, str(exc))) + except Exception as exc: # noqa: PERF203 + self._logger.warning( + f"Could not decode message: {encoded}, error {exc!s}" + ) else: requests.append(request) - self.spider_log_producer.send(b'0123456789abcdef0123456789abcdef012345678', - self._encoder.encode_offset(self.partition_id, - self.consumer.get_offset(self.partition_id))) + self.spider_log_producer.send( + 
b"0123456789abcdef0123456789abcdef012345678", + self._encoder.encode_offset( + self.partition_id, self.consumer.get_offset(self.partition_id) + ), + ) return requests def get_next_requests(self, max_n_requests, **kwargs): @@ -94,11 +108,11 @@ def states(self): def aggregate_per_host(requests): - per_host = dict() + per_host = {} for link in requests: - if b'fingerprint' not in link.meta[b'domain']: + if b"fingerprint" not in link.meta[b"domain"]: continue - host_fprint = link.meta[b'domain'][b'fingerprint'] + host_fprint = link.meta[b"domain"][b"fingerprint"] if host_fprint not in per_host: per_host[host_fprint] = [] per_host[host_fprint].append(link) @@ -106,6 +120,6 @@ def aggregate_per_host(requests): def get_host_fprint(request): - if b'fingerprint' not in request.meta[b'domain']: + if b"fingerprint" not in request.meta[b"domain"]: return None - return request.meta[b'domain'][b'fingerprint'] \ No newline at end of file + return request.meta[b"domain"][b"fingerprint"] diff --git a/frontera/contrib/backends/sqlalchemy/__init__.py b/frontera/contrib/backends/sqlalchemy/__init__.py index b8e7b8aa1..822df97ab 100644 --- a/frontera/contrib/backends/sqlalchemy/__init__.py +++ b/frontera/contrib/backends/sqlalchemy/__init__.py @@ -1,13 +1,11 @@ -from __future__ import absolute_import - from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.orm import sessionmaker -from frontera.core.components import DistributedBackend from frontera.contrib.backends import CommonBackend from frontera.contrib.backends.sqlalchemy.components import Metadata, Queue, States from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase +from frontera.core.components import DistributedBackend from frontera.utils.misc import load_object @@ -15,14 +13,14 @@ class SQLAlchemyBackend(CommonBackend): def __init__(self, manager): self.manager = manager settings = manager.settings - engine = settings.get('SQLALCHEMYBACKEND_ENGINE') - engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO') - drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - models = settings.get('SQLALCHEMYBACKEND_MODELS') + engine = settings.get("SQLALCHEMYBACKEND_ENGINE") + engine_echo = settings.get("SQLALCHEMYBACKEND_ENGINE_ECHO") + drop_all_tables = settings.get("SQLALCHEMYBACKEND_DROP_ALL_TABLES") + clear_content = settings.get("SQLALCHEMYBACKEND_CLEAR_CONTENT") + models = settings.get("SQLALCHEMYBACKEND_MODELS") self.engine = create_engine(engine, echo=engine_echo) - self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) + self.models = {name: load_object(klass) for name, klass in models.items()} if drop_all_tables: DeclarativeBase.metadata.drop_all(self.engine) @@ -33,21 +31,31 @@ def __init__(self, manager): if clear_content: session = self.session_cls() - for name, table in DeclarativeBase.metadata.tables.items(): + for table in DeclarativeBase.metadata.tables.values(): session.execute(table.delete()) session.close() - self._metadata = Metadata(self.session_cls, self.models['MetadataModel'], - settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) - self._states = States(self.session_cls, self.models['StateModel'], - settings.get('STATE_CACHE_SIZE_LIMIT')) + self._metadata = Metadata( + self.session_cls, + self.models["MetadataModel"], + settings.get("SQLALCHEMYBACKEND_CACHE_SIZE"), + ) + self._states = States( + self.session_cls, + 
self.models["StateModel"], + settings.get("STATE_CACHE_SIZE_LIMIT"), + ) self._queue = self._create_queue(settings) def frontier_stop(self): - super(SQLAlchemyBackend, self).frontier_stop() + super().frontier_stop() self.engine.dispose() def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + return Queue( + self.session_cls, + self.models["QueueModel"], + settings.get("SPIDER_FEED_PARTITIONS"), + ) @property def queue(self): @@ -63,39 +71,55 @@ def states(self): class FIFOBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy FIFO Backend' + component_name = "SQLAlchemy FIFO Backend" def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - ordering='created') + return Queue( + self.session_cls, + self.models["QueueModel"], + settings.get("SPIDER_FEED_PARTITIONS"), + ordering="created", + ) class LIFOBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy LIFO Backend' + component_name = "SQLAlchemy LIFO Backend" def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS'), - ordering='created_desc') + return Queue( + self.session_cls, + self.models["QueueModel"], + settings.get("SPIDER_FEED_PARTITIONS"), + ordering="created_desc", + ) class DFSBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy DFS Backend' + component_name = "SQLAlchemy DFS Backend" def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + return Queue( + self.session_cls, + self.models["QueueModel"], + settings.get("SPIDER_FEED_PARTITIONS"), + ) def _get_score(self, obj): - return -obj.meta[b'depth'] + return -obj.meta[b"depth"] class BFSBackend(SQLAlchemyBackend): - component_name = 'SQLAlchemy BFS Backend' + component_name = "SQLAlchemy BFS Backend" def _create_queue(self, settings): - return Queue(self.session_cls, self.models['QueueModel'], settings.get('SPIDER_FEED_PARTITIONS')) + return Queue( + self.session_cls, + self.models["QueueModel"], + settings.get("SPIDER_FEED_PARTITIONS"), + ) def _get_score(self, obj): - return obj.meta[b'depth'] + return obj.meta[b"depth"] BASE = CommonBackend @@ -109,11 +133,11 @@ class Distributed(DistributedBackend): def __init__(self, manager): self.manager = manager settings = manager.settings - engine = settings.get('SQLALCHEMYBACKEND_ENGINE') - engine_echo = settings.get('SQLALCHEMYBACKEND_ENGINE_ECHO') - models = settings.get('SQLALCHEMYBACKEND_MODELS') + engine = settings.get("SQLALCHEMYBACKEND_ENGINE") + engine_echo = settings.get("SQLALCHEMYBACKEND_ENGINE_ECHO") + models = settings.get("SQLALCHEMYBACKEND_MODELS") self.engine = create_engine(engine, echo=engine_echo) - self.models = dict([(name, load_object(klass)) for name, klass in models.items()]) + self.models = {name: load_object(klass) for name, klass in models.items()} self.session_cls = sessionmaker() self.session_cls.configure(bind=self.engine) self._metadata = None @@ -124,13 +148,16 @@ def __init__(self, manager): def strategy_worker(cls, manager): b = cls(manager) settings = manager.settings - drop_all_tables = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') - model = b.models['StateModel'] - inspector = Inspector.from_engine(b.engine) - + drop_all_tables = settings.get("SQLALCHEMYBACKEND_DROP_ALL_TABLES") + clear_content = 
settings.get("SQLALCHEMYBACKEND_CLEAR_CONTENT") + model = b.models["StateModel"] if drop_all_tables: - if model.__table__.name in inspector.get_table_names(): + inspector = Inspector.from_engine(b.engine) + try: + table_names = set(inspector.get_table_names()) + except RuntimeError: + table_names = set() + if model.__table__.name in table_names: model.__table__.drop(bind=b.engine) model.__table__.create(bind=b.engine) @@ -138,20 +165,19 @@ def strategy_worker(cls, manager): session = b.session_cls() session.execute(model.__table__.delete()) session.close() - b._states = States(b.session_cls, model, - settings.get('STATE_CACHE_SIZE_LIMIT')) + b._states = States(b.session_cls, model, settings.get("STATE_CACHE_SIZE_LIMIT")) return b @classmethod def db_worker(cls, manager): b = cls(manager) settings = manager.settings - drop = settings.get('SQLALCHEMYBACKEND_DROP_ALL_TABLES') - clear_content = settings.get('SQLALCHEMYBACKEND_CLEAR_CONTENT') + drop = settings.get("SQLALCHEMYBACKEND_DROP_ALL_TABLES") + clear_content = settings.get("SQLALCHEMYBACKEND_CLEAR_CONTENT") inspector = Inspector.from_engine(b.engine) - metadata_m = b.models['MetadataModel'] - queue_m = b.models['QueueModel'] + metadata_m = b.models["MetadataModel"] + queue_m = b.models["QueueModel"] if drop: existing = inspector.get_table_names() if metadata_m.__table__.name in existing: @@ -167,9 +193,10 @@ def db_worker(cls, manager): session.execute(queue_m.__table__.delete()) session.close() - b._metadata = Metadata(b.session_cls, metadata_m, - settings.get('SQLALCHEMYBACKEND_CACHE_SIZE')) - b._queue = Queue(b.session_cls, queue_m, settings.get('SPIDER_FEED_PARTITIONS')) + b._metadata = Metadata( + b.session_cls, metadata_m, settings.get("SQLALCHEMYBACKEND_CACHE_SIZE") + ) + b._queue = Queue(b.session_cls, queue_m, settings.get("SPIDER_FEED_PARTITIONS")) return b @property @@ -198,10 +225,14 @@ def add_seeds(self, seeds): self.metadata.add_seeds(seeds) def get_next_requests(self, max_next_requests, **kwargs): - partitions = kwargs.pop('partitions', [0]) # TODO: Collect from all known partitions + partitions = kwargs.pop( + "partitions", [0] + ) # TODO: Collect from all known partitions batch = [] for partition_id in partitions: - batch.extend(self.queue.get_next_requests(max_next_requests, partition_id, **kwargs)) + batch.extend( + self.queue.get_next_requests(max_next_requests, partition_id, **kwargs) + ) return batch def page_crawled(self, response): @@ -215,4 +246,3 @@ def request_error(self, request, error): def finished(self): raise NotImplementedError - diff --git a/frontera/contrib/backends/sqlalchemy/components.py b/frontera/contrib/backends/sqlalchemy/components.py index 8661ac576..6e8f70047 100644 --- a/frontera/contrib/backends/sqlalchemy/components.py +++ b/frontera/contrib/backends/sqlalchemy/components.py @@ -1,20 +1,18 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import logging from datetime import datetime -from time import time, sleep +from time import sleep, time from cachetools import LRUCache -from frontera.contrib.backends.partitioners import Crc32NamePartitioner +from w3lib.util import to_bytes, to_unicode + from frontera.contrib.backends.memory import MemoryStates +from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.contrib.backends.sqlalchemy.models import DeclarativeBase -from frontera.core.components import Metadata as BaseMetadata, Queue as BaseQueue +from frontera.core.components import Metadata as BaseMetadata +from frontera.core.components import 
Queue as BaseQueue from frontera.core.models import Request, Response -from frontera.utils.misc import get_crc32, chunks +from frontera.utils.misc import chunks, get_crc32 from frontera.utils.url import parse_domain_from_url_fast -import six -from six.moves import range -from w3lib.util import to_native_str, to_bytes def retry_and_rollback(func): @@ -23,24 +21,26 @@ def func_wrapper(self, *args, **kwargs): while True: try: return func(self, *args, **kwargs) - except Exception as exc: + except Exception as exc: # noqa: PERF203 self.logger.exception(exc) self.session.rollback() sleep(5) tries -= 1 if tries > 0: - self.logger.info("Tries left %i" % tries) + self.logger.info(f"Tries left {tries}") continue - else: - raise exc + raise exc + return func_wrapper class Metadata(BaseMetadata): def __init__(self, session_cls, model_cls, cache_size): - self.session = session_cls(expire_on_commit=False) # FIXME: Should be explicitly mentioned in docs + self.session = session_cls( + expire_on_commit=False + ) # FIXME: Should be explicitly mentioned in docs self.model = model_cls - self.table = DeclarativeBase.metadata.tables['metadata'] + self.table = DeclarativeBase.metadata.tables["metadata"] self.cache = LRUCache(cache_size) self.logger = logging.getLogger("sqlalchemy.metadata") @@ -56,36 +56,46 @@ def add_seeds(self, seeds): @retry_and_rollback def request_error(self, page, error): - m = self._modify_page(page) if page.meta[b'fingerprint'] in self.cache else self._create_page(page) + m = ( + self._modify_page(page) + if page.meta[b"fingerprint"] in self.cache + else self._create_page(page) + ) m.error = error self.cache[to_bytes(m.fingerprint)] = self.session.merge(m) self.session.commit() @retry_and_rollback def page_crawled(self, response): - r = self._modify_page(response) if response.meta[b'fingerprint'] in self.cache else self._create_page(response) + r = ( + self._modify_page(response) + if response.meta[b"fingerprint"] in self.cache + else self._create_page(response) + ) self.cache[r.fingerprint] = self.session.merge(r) self.session.commit() def links_extracted(self, request, links): for link in links: - if link.meta[b'fingerprint'] not in self.cache: - self.cache[link.meta[b'fingerprint']] = self.session.merge(self._create_page(link)) + if link.meta[b"fingerprint"] not in self.cache: + self.cache[link.meta[b"fingerprint"]] = self.session.merge( + self._create_page(link) + ) self.session.commit() def _modify_page(self, obj): - db_page = self.cache[obj.meta[b'fingerprint']] + db_page = self.cache[obj.meta[b"fingerprint"]] db_page.fetched_at = datetime.utcnow() if isinstance(obj, Response): db_page.headers = obj.request.headers - db_page.method = to_native_str(obj.request.method) + db_page.method = to_unicode(obj.request.method) db_page.cookies = obj.request.cookies db_page.status_code = obj.status_code return db_page def _create_page(self, obj): db_page = self.model() - db_page.fingerprint = to_native_str(obj.meta[b'fingerprint']) + db_page.fingerprint = to_unicode(obj.meta[b"fingerprint"]) db_page.url = obj.url db_page.created_at = datetime.utcnow() db_page.meta = obj.meta @@ -93,30 +103,29 @@ def _create_page(self, obj): if isinstance(obj, Request): db_page.headers = obj.headers - db_page.method = to_native_str(obj.method) + db_page.method = to_unicode(obj.method) db_page.cookies = obj.cookies elif isinstance(obj, Response): db_page.headers = obj.request.headers - db_page.method = to_native_str(obj.request.method) + db_page.method = to_unicode(obj.request.method) db_page.cookies = 
obj.request.cookies db_page.status_code = obj.status_code return db_page @retry_and_rollback def update_score(self, batch): - for fprint, score, request, schedule in batch: - m = self.model(fingerprint=to_native_str(fprint), score=score) + for fprint, score, _request, _schedule in batch: + m = self.model(fingerprint=to_unicode(fprint), score=score) self.session.merge(m) self.session.commit() class States(MemoryStates): - def __init__(self, session_cls, model_cls, cache_size_limit): - super(States, self).__init__(cache_size_limit) + super().__init__(cache_size_limit) self.session = session_cls() self.model = model_cls - self.table = DeclarativeBase.metadata.tables['states'] + self.table = DeclarativeBase.metadata.tables["states"] self.logger = logging.getLogger("sqlalchemy.states") @retry_and_rollback @@ -126,30 +135,32 @@ def frontier_stop(self): @retry_and_rollback def fetch(self, fingerprints): - to_fetch = [to_native_str(f) for f in fingerprints if f not in self._cache] + to_fetch = [to_unicode(f) for f in fingerprints if f not in self._cache] self.logger.debug("cache size %s", len(self._cache)) self.logger.debug("to fetch %d from %d", len(to_fetch), len(fingerprints)) for chunk in chunks(to_fetch, 128): - for state in self.session.query(self.model).filter(self.model.fingerprint.in_(chunk)): + for state in self.session.query(self.model).filter( + self.model.fingerprint.in_(chunk) + ): self._cache[to_bytes(state.fingerprint)] = state.state @retry_and_rollback def flush(self, force_clear=False): - for fingerprint, state_val in six.iteritems(self._cache): - state = self.model(fingerprint=to_native_str(fingerprint), state=state_val) + for fingerprint, state_val in self._cache.items(): + state = self.model(fingerprint=to_unicode(fingerprint), state=state_val) self.session.merge(state) self.session.commit() self.logger.debug("State cache has been flushed.") - super(States, self).flush(force_clear) + super().flush(force_clear) class Queue(BaseQueue): - def __init__(self, session_cls, queue_cls, partitions, ordering='default'): + def __init__(self, session_cls, queue_cls, partitions, ordering="default"): self.session = session_cls() self.queue_model = queue_cls self.logger = logging.getLogger("sqlalchemy.queue") - self.partitions = [i for i in range(0, partitions)] + self.partitions = list(range(partitions)) self.partitioner = Crc32NamePartitioner(self.partitions) self.ordering = ordering @@ -157,11 +168,13 @@ def frontier_stop(self): self.session.close() def _order_by(self, query): - if self.ordering == 'created': + if self.ordering == "created": return query.order_by(self.queue_model.created_at) - if self.ordering == 'created_desc': + if self.ordering == "created_desc": return query.order_by(self.queue_model.created_at.desc()) - return query.order_by(self.queue_model.score, self.queue_model.created_at) # TODO: remove second parameter, + return query.order_by( + self.queue_model.score, self.queue_model.created_at + ) # TODO: remove second parameter, # it's not necessary for proper crawling, but needed for tests def get_next_requests(self, max_n_requests, partition_id, **kwargs): @@ -174,12 +187,21 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): """ results = [] try: - for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\ - limit(max_n_requests): - method = item.method or b'GET' - r = Request(item.url, method=method, meta=item.meta, headers=item.headers, cookies=item.cookies) - r.meta[b'fingerprint'] = 
to_bytes(item.fingerprint) - r.meta[b'score'] = item.score + for item in self._order_by( + self.session.query(self.queue_model).filter_by( + partition_id=partition_id + ) + ).limit(max_n_requests): + method = item.method or b"GET" + r = Request( + item.url, + method=method, + meta=item.meta, + headers=item.headers, + cookies=item.cookies, + ) + r.meta[b"fingerprint"] = to_bytes(item.fingerprint) + r.meta[b"score"] = item.score results.append(r) self.session.delete(item) self.session.commit() @@ -195,17 +217,28 @@ def schedule(self, batch): if schedule: _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) + self.logger.error( + f"Can't get hostname for URL {request.url}, fingerprint {fprint}" + ) partition_id = self.partitions[0] host_crc32 = 0 else: partition_id = self.partitioner.partition(hostname, self.partitions) host_crc32 = get_crc32(hostname) - q = self.queue_model(fingerprint=to_native_str(fprint), score=score, url=request.url, meta=request.meta, - headers=request.headers, cookies=request.cookies, method=to_native_str(request.method), - partition_id=partition_id, host_crc32=host_crc32, created_at=time()*1E+6) + q = self.queue_model( + fingerprint=to_unicode(fprint), + score=score, + url=request.url, + meta=request.meta, + headers=request.headers, + cookies=request.cookies, + method=to_unicode(request.method), + partition_id=partition_id, + host_crc32=host_crc32, + created_at=time() * 1e6, + ) to_save.append(q) - request.meta[b'state'] = States.QUEUED + request.meta[b"state"] = States.QUEUED self.session.bulk_save_objects(to_save) self.session.commit() @@ -215,7 +248,6 @@ def count(self): class BroadCrawlingQueue(Queue): - GET_RETRIES = 3 @retry_and_rollback @@ -236,7 +268,7 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): min_requests = kwargs.pop("min_requests", None) min_hosts = kwargs.pop("min_hosts", None) max_requests_per_host = kwargs.pop("max_requests_per_host", None) - assert(max_n_requests > min_requests) + assert max_n_requests > min_requests queue = {} limit = max_n_requests @@ -245,15 +277,26 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): while tries < self.GET_RETRIES: tries += 1 limit *= 5.5 if tries > 1 else 1.0 - self.logger.debug("Try %d, limit %d, last attempt: requests %d, hosts %d", - tries, limit, count, len(queue.keys())) + self.logger.debug( + "Try %d, limit %d, last attempt: requests %d, hosts %d", + tries, + limit, + count, + len(queue.keys()), + ) queue.clear() count = 0 - for item in self._order_by(self.session.query(self.queue_model).filter_by(partition_id=partition_id)).\ - limit(limit): + for item in self._order_by( + self.session.query(self.queue_model).filter_by( + partition_id=partition_id + ) + ).limit(limit): if item.host_crc32 not in queue: queue[item.host_crc32] = [] - if max_requests_per_host is not None and len(queue[item.host_crc32]) > max_requests_per_host: + if ( + max_requests_per_host is not None + and len(queue[item.host_crc32]) > max_requests_per_host + ): continue queue[item.host_crc32].append(item) count += 1 @@ -264,14 +307,23 @@ def get_next_requests(self, max_n_requests, partition_id, **kwargs): if min_requests is not None and count < min_requests: continue break - self.logger.debug("Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count) + self.logger.debug( + "Finished: tries %d, hosts %d, requests %d", tries, len(queue.keys()), count + ) 
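# A minimal, standalone sketch of the host-balancing retry policy that
# BroadCrawlingQueue.get_next_requests above implements -- this is not part of
# the patch. ``fetch`` is a hypothetical callable returning (host_key, item)
# pairs for a given limit; the real code queries the SQLAlchemy queue table
# ordered by score instead, and deletes the rows it returns.
def gather_per_host(fetch, max_n_requests, min_requests=None, min_hosts=None,
                    max_requests_per_host=None, retries=3):
    limit = max_n_requests
    queue = {}
    count = 0
    for attempt in range(1, retries + 1):
        # Widen the window on every retry, mirroring ``limit *= 5.5`` above.
        if attempt > 1:
            limit *= 5.5
        queue.clear()
        count = 0
        for host, item in fetch(int(limit)):
            bucket = queue.setdefault(host, [])
            # Cap how many requests a single host may contribute to the batch.
            if (max_requests_per_host is not None
                    and len(bucket) >= max_requests_per_host):
                continue
            bucket.append(item)
            count += 1
            if count >= max_n_requests:
                break
        # Retry only while the batch is too small or too concentrated.
        if min_hosts is not None and len(queue) < min_hosts:
            continue
        if min_requests is not None and count < min_requests:
            continue
        break
    return queue

# Example: five queued items on three hosts, at most two kept per host.
# items = [("a", 1), ("a", 2), ("a", 3), ("b", 4), ("c", 5)]
# gather_per_host(lambda limit: items[:limit], max_n_requests=5,
#                 min_hosts=2, max_requests_per_host=2)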
results = [] - for items in six.itervalues(queue): + for items in queue.values(): for item in items: - method = item.method or b'GET' - results.append(Request(item.url, method=method, - meta=item.meta, headers=item.headers, cookies=item.cookies)) + method = item.method or b"GET" + results.append( + Request( + item.url, + method=method, + meta=item.meta, + headers=item.headers, + cookies=item.cookies, + ) + ) self.session.delete(item) self.session.commit() return results diff --git a/frontera/contrib/backends/sqlalchemy/models.py b/frontera/contrib/backends/sqlalchemy/models.py index 8211d21c6..53c132b8c 100644 --- a/frontera/contrib/backends/sqlalchemy/models.py +++ b/frontera/contrib/backends/sqlalchemy/models.py @@ -1,18 +1,25 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from sqlalchemy import Column, String, Integer, PickleType, SmallInteger, Float, DateTime, BigInteger +from sqlalchemy import ( + BigInteger, + Column, + DateTime, + Float, + Integer, + PickleType, + SmallInteger, + String, +) from sqlalchemy.ext.declarative import declarative_base DeclarativeBase = declarative_base() class MetadataModel(DeclarativeBase): - __tablename__ = 'metadata' + __tablename__ = "metadata" __table_args__ = ( { - 'mysql_charset': 'utf8', - 'mysql_engine': 'InnoDB', - 'mysql_row_format': 'DYNAMIC', + "mysql_charset": "utf8", + "mysql_engine": "InnoDB", + "mysql_row_format": "DYNAMIC", }, ) @@ -34,16 +41,16 @@ def query(cls, session): return session.query(cls) def __repr__(self): - return '' % (self.url, self.fingerprint) + return f"" class StateModel(DeclarativeBase): - __tablename__ = 'states' + __tablename__ = "states" __table_args__ = ( { - 'mysql_charset': 'utf8', - 'mysql_engine': 'InnoDB', - 'mysql_row_format': 'DYNAMIC', + "mysql_charset": "utf8", + "mysql_engine": "InnoDB", + "mysql_row_format": "DYNAMIC", }, ) @@ -55,15 +62,15 @@ def query(cls, session): return session.query(cls) def __repr__(self): - return '' % (self.fingerprint, self.state) + return f"" -class QueueModelMixin(object): +class QueueModelMixin: __table_args__ = ( { - 'mysql_charset': 'utf8', - 'mysql_engine': 'InnoDB', - 'mysql_row_format': 'DYNAMIC', + "mysql_charset": "utf8", + "mysql_engine": "InnoDB", + "mysql_row_format": "DYNAMIC", }, ) @@ -82,11 +89,11 @@ class QueueModelMixin(object): class QueueModel(QueueModelMixin, DeclarativeBase): - __tablename__ = 'queue' + __tablename__ = "queue" @classmethod def query(cls, session): return session.query(cls) def __repr__(self): - return '' % (self.url, self.id) + return f"" diff --git a/frontera/contrib/backends/sqlalchemy/revisiting.py b/frontera/contrib/backends/sqlalchemy/revisiting.py index b2b574715..454b7e612 100644 --- a/frontera/contrib/backends/sqlalchemy/revisiting.py +++ b/frontera/contrib/backends/sqlalchemy/revisiting.py @@ -1,20 +1,18 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import logging -from datetime import datetime, timedelta -from time import time, sleep from calendar import timegm +from datetime import datetime, timedelta +from time import sleep, time -from sqlalchemy import Column, BigInteger +from sqlalchemy import BigInteger, Column from frontera import Request from frontera.contrib.backends.partitioners import Crc32NamePartitioner from frontera.contrib.backends.sqlalchemy import SQLAlchemyBackend -from frontera.contrib.backends.sqlalchemy.models import QueueModelMixin, DeclarativeBase -from frontera.core.components import Queue as BaseQueue, States +from frontera.contrib.backends.sqlalchemy.models 
import DeclarativeBase, QueueModelMixin +from frontera.core.components import Queue as BaseQueue +from frontera.core.components import States from frontera.utils.misc import get_crc32 from frontera.utils.url import parse_domain_from_url_fast -from six.moves import range def utcnow_timestamp(): @@ -23,7 +21,7 @@ def utcnow_timestamp(): class RevisitingQueueModel(QueueModelMixin, DeclarativeBase): - __tablename__ = 'revisiting_queue' + __tablename__ = "revisiting_queue" crawl_at = Column(BigInteger, nullable=False) @@ -34,16 +32,16 @@ def func_wrapper(self, *args, **kwargs): while True: try: return func(self, *args, **kwargs) - except Exception as exc: + except Exception as exc: # noqa: PERF203 self.logger.exception(exc) self.session.rollback() sleep(5) tries -= 1 if tries > 0: - self.logger.info("Tries left %i" % tries) + self.logger.info(f"Tries left {tries}") continue - else: - raise exc + raise exc + return func_wrapper @@ -52,7 +50,7 @@ def __init__(self, session_cls, queue_cls, partitions): self.session = session_cls() self.queue_model = queue_cls self.logger = logging.getLogger("sqlalchemy.revisiting.queue") - self.partitions = [i for i in range(0, partitions)] + self.partitions = list(range(partitions)) self.partitioner = Crc32NamePartitioner(self.partitions) def frontier_stop(self): @@ -61,13 +59,24 @@ def frontier_stop(self): def get_next_requests(self, max_n_requests, partition_id, **kwargs): results = [] try: - for item in self.session.query(self.queue_model).\ - filter(RevisitingQueueModel.crawl_at <= utcnow_timestamp(), - RevisitingQueueModel.partition_id == partition_id).\ - limit(max_n_requests): - method = 'GET' if not item.method else item.method - results.append(Request(item.url, method=method, meta=item.meta, headers=item.headers, - cookies=item.cookies)) + for item in ( + self.session.query(self.queue_model) + .filter( + RevisitingQueueModel.crawl_at <= utcnow_timestamp(), + RevisitingQueueModel.partition_id == partition_id, + ) + .limit(max_n_requests) + ): + method = "GET" if not item.method else item.method + results.append( + Request( + item.url, + method=method, + meta=item.meta, + headers=item.headers, + cookies=item.cookies, + ) + ) self.session.delete(item) self.session.commit() except Exception as exc: @@ -82,19 +91,34 @@ def schedule(self, batch): if schedule: _, hostname, _, _, _, _ = parse_domain_from_url_fast(request.url) if not hostname: - self.logger.error("Can't get hostname for URL %s, fingerprint %s" % (request.url, fprint)) + self.logger.error( + f"Can't get hostname for URL {request.url}, fingerprint {fprint}" + ) partition_id = self.partitions[0] host_crc32 = 0 else: partition_id = self.partitioner.partition(hostname, self.partitions) host_crc32 = get_crc32(hostname) - schedule_at = request.meta[b'crawl_at'] if b'crawl_at' in request.meta else utcnow_timestamp() - q = self.queue_model(fingerprint=fprint, score=score, url=request.url, meta=request.meta, - headers=request.headers, cookies=request.cookies, method=request.method, - partition_id=partition_id, host_crc32=host_crc32, created_at=time()*1E+6, - crawl_at=schedule_at) + schedule_at = ( + request.meta[b"crawl_at"] + if b"crawl_at" in request.meta + else utcnow_timestamp() + ) + q = self.queue_model( + fingerprint=fprint, + score=score, + url=request.url, + meta=request.meta, + headers=request.headers, + cookies=request.cookies, + method=request.method, + partition_id=partition_id, + host_crc32=host_crc32, + created_at=time() * 1e6, + crawl_at=schedule_at, + ) to_save.append(q) - 
request.meta[b'state'] = States.QUEUED + request.meta[b"state"] = States.QUEUED self.session.bulk_save_objects(to_save) self.session.commit() @@ -104,29 +128,34 @@ def count(self): class Backend(SQLAlchemyBackend): - def _create_queue(self, settings): self.interval = settings.get("SQLALCHEMYBACKEND_REVISIT_INTERVAL") assert isinstance(self.interval, timedelta) self.interval = self.interval.total_seconds() - return RevisitingQueue(self.session_cls, RevisitingQueueModel, settings.get('SPIDER_FEED_PARTITIONS')) + return RevisitingQueue( + self.session_cls, + RevisitingQueueModel, + settings.get("SPIDER_FEED_PARTITIONS"), + ) def _schedule(self, requests): batch = [] for request in requests: - if request.meta[b'state'] in [States.NOT_CRAWLED]: - request.meta[b'crawl_at'] = utcnow_timestamp() - elif request.meta[b'state'] in [States.CRAWLED, States.ERROR]: - request.meta[b'crawl_at'] = utcnow_timestamp() + self.interval + if request.meta[b"state"] in [States.NOT_CRAWLED]: + request.meta[b"crawl_at"] = utcnow_timestamp() + elif request.meta[b"state"] in [States.CRAWLED, States.ERROR]: + request.meta[b"crawl_at"] = utcnow_timestamp() + self.interval else: - continue # QUEUED - batch.append((request.meta[b'fingerprint'], self._get_score(request), request, True)) + continue # QUEUED + batch.append( + (request.meta[b"fingerprint"], self._get_score(request), request, True) + ) self.queue.schedule(batch) self.metadata.update_score(batch) self.queue_size += len(batch) def page_crawled(self, response): - super(Backend, self).page_crawled(response) + super().page_crawled(response) self.states.set_states(response.request) self._schedule([response.request]) self.states.update_cache(response.request) diff --git a/frontera/contrib/canonicalsolvers/__init__.py b/frontera/contrib/canonicalsolvers/__init__.py index 4c40587a0..c5cffd672 100644 --- a/frontera/contrib/canonicalsolvers/__init__.py +++ b/frontera/contrib/canonicalsolvers/__init__.py @@ -1,5 +1,10 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from . import basic from .common import CorporateWebsiteFriendly + Basic = basic.BasicCanonicalSolver + +__all__ = [ + "Basic", + "CorporateWebsiteFriendly", + "basic", +] diff --git a/frontera/contrib/canonicalsolvers/basic.py b/frontera/contrib/canonicalsolvers/basic.py index 944d8c6c1..a6f740a45 100644 --- a/frontera/contrib/canonicalsolvers/basic.py +++ b/frontera/contrib/canonicalsolvers/basic.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from frontera.core.components import CanonicalSolver @@ -11,6 +9,7 @@ class BasicCanonicalSolver(CanonicalSolver): :attr:`page_crawled() `) at the price of duplicating records in Frontera for pages having more than one URL or complex redirects chains. 
""" + def frontier_start(self): pass @@ -32,15 +31,15 @@ def request_error(self, page, error): self._set_canonical(page) def _set_canonical(self, obj): - if b'redirect_urls' in obj.meta: - redirect_urls = obj.meta[b'redirect_urls'] - redirect_fingerprints = obj.meta[b'redirect_fingerprints'] + if b"redirect_urls" in obj.meta: + redirect_urls = obj.meta[b"redirect_urls"] + redirect_fingerprints = obj.meta[b"redirect_fingerprints"] redirect_urls.append(obj.url) - redirect_fingerprints.append(obj.meta[b'fingerprint']) + redirect_fingerprints.append(obj.meta[b"fingerprint"]) obj._url = redirect_urls[0] - obj.meta[b'fingerprint'] = redirect_fingerprints[0] + obj.meta[b"fingerprint"] = redirect_fingerprints[0] - if b'redirect_domains' in obj.meta: - redirect_domains = obj.meta[b'redirect_domains'] - redirect_domains.append(obj.meta[b'domain']) - obj.meta[b'domain'] = redirect_domains[0] + if b"redirect_domains" in obj.meta: + redirect_domains = obj.meta[b"redirect_domains"] + redirect_domains.append(obj.meta[b"domain"]) + obj.meta[b"domain"] = redirect_domains[0] diff --git a/frontera/contrib/canonicalsolvers/common.py b/frontera/contrib/canonicalsolvers/common.py index 7683dc00d..5763a540c 100644 --- a/frontera/contrib/canonicalsolvers/common.py +++ b/frontera/contrib/canonicalsolvers/common.py @@ -1,25 +1,23 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from .basic import BasicCanonicalSolver from frontera.utils.url import parse_url +from .basic import BasicCanonicalSolver + class CorporateWebsiteFriendly(BasicCanonicalSolver): - def _set_canonical(self, obj): - if b'redirect_urls' in obj.meta: + if b"redirect_urls" in obj.meta: # if home page is requested then leave the target page as canonical - urls = obj.meta[b'redirect_urls'] + urls = obj.meta[b"redirect_urls"] scheme, netloc, path, params, query, fragment = parse_url(urls[0]) - if not path or path in ['/', 'index.html', 'index.htm', 'default.htm']: + if not path or path in ["/", "index.html", "index.htm", "default.htm"]: return # check if redirect is within the same hostname target = parse_url(obj.url) - src_hostname, _, _ = netloc.partition(':') - trg_hostname, _, _ = target.netloc.partition(':') + src_hostname, _, _ = netloc.partition(":") + trg_hostname, _, _ = target.netloc.partition(":") if src_hostname == trg_hostname: return # otherwise default behavior - super(CorporateWebsiteFriendly, self)._set_canonical(obj) + super()._set_canonical(obj) diff --git a/frontera/contrib/messagebus/__init__.py b/frontera/contrib/messagebus/__init__.py index 7c68785e9..e69de29bb 100644 --- a/frontera/contrib/messagebus/__init__.py +++ b/frontera/contrib/messagebus/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/frontera/contrib/messagebus/kafka/__init__.py b/frontera/contrib/messagebus/kafka/__init__.py index 7c68785e9..e69de29bb 100644 --- a/frontera/contrib/messagebus/kafka/__init__.py +++ b/frontera/contrib/messagebus/kafka/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/frontera/contrib/messagebus/kafka/async.py b/frontera/contrib/messagebus/kafka/async_.py similarity index 69% rename from frontera/contrib/messagebus/kafka/async.py rename to frontera/contrib/messagebus/kafka/async_.py index 8ef89f4f0..3544e0d6e 100644 --- a/frontera/contrib/messagebus/kafka/async.py +++ b/frontera/contrib/messagebus/kafka/async_.py @@ -1,28 +1,26 @@ -from __future__ import absolute_import +import collections import copy import logging import time 
-import collections -import six + +from kafka import TopicPartition +from kafka import errors as Errors from kafka.client_async import KafkaClient -from kafka import errors as Errors, TopicPartition from kafka.future import Future from kafka.protocol.commit import GroupCoordinatorRequest, OffsetFetchRequest from kafka.protocol.offset import OffsetRequest from kafka.structs import OffsetAndMetadata - -log = logging.getLogger('offsets-fetcher') +log = logging.getLogger("offsets-fetcher") -class OffsetsFetcherAsync(object): - +class OffsetsFetcherAsync: DEFAULT_CONFIG = { - 'session_timeout_ms': 30000, - 'heartbeat_interval_ms': 3000, - 'retry_backoff_ms': 100, - 'api_version': (0, 9), - 'metric_group_prefix': '', + "session_timeout_ms": 30000, + "heartbeat_interval_ms": 3000, + "retry_backoff_ms": 100, + "api_version": (0, 9), + "metric_group_prefix": "", } def __init__(self, **configs): @@ -30,19 +28,18 @@ def __init__(self, **configs): self.config.update(configs) self._client = KafkaClient(**self.config) self._coordinator_id = None - self.group_id = configs['group_id'] - self.topic = configs['topic'] + self.group_id = configs["group_id"] + self.topic = configs["topic"] def _ensure_coordinator_known(self): """Block until the coordinator for this group is known (and we have an active connection -- java client uses unsent queue). """ while self._coordinator_unknown(): - # Prior to 0.8.2 there was no group coordinator # so we will just pick a node at random and treat # it as the "coordinator" - if self.config['api_version'] < (0, 8, 2): + if self.config["api_version"] < (0, 8, 2): self._coordinator_id = self._client.least_loaded_node() self._client.ready(self._coordinator_id) continue @@ -51,10 +48,11 @@ def _ensure_coordinator_known(self): self._client.poll(future=future) if future.failed(): - if isinstance(future.exception, - Errors.GroupCoordinatorNotAvailableError): + if isinstance( + future.exception, Errors.GroupCoordinatorNotAvailableError + ): continue - elif future.retriable(): + if future.retriable(): metadata_update = self._client.cluster.request_update() self._client.poll(future=metadata_update) else: @@ -80,8 +78,12 @@ def _coordinator_unknown(self): def _coordinator_dead(self, error=None): """Mark the current coordinator as dead.""" if self._coordinator_id is not None: - log.warning("Marking the coordinator dead (node %s) for group %s: %s.", - self._coordinator_id, self.group_id, error) + log.warning( + "Marking the coordinator dead (node %s) for group %s: %s.", + self._coordinator_id, + self.group_id, + error, + ) self._coordinator_id = None def _send_group_coordinator_request(self): @@ -94,8 +96,11 @@ def _send_group_coordinator_request(self): if node_id is None: return Future().failure(Errors.NoBrokersAvailable()) - log.debug("Sending group coordinator request for group %s to broker %s", - self.group_id, node_id) + log.debug( + "Sending group coordinator request for group %s to broker %s", + self.group_id, + node_id, + ) request = GroupCoordinatorRequest[0](self.group_id) future = Future() _f = self._client.send(node_id, request) @@ -121,8 +126,11 @@ def _handle_group_coordinator_response(self, future, response): return self._coordinator_id = response.coordinator_id - log.info("Discovered coordinator %s for group %s", - self._coordinator_id, self.group_id) + log.info( + "Discovered coordinator %s for group %s", + self._coordinator_id, + self.group_id, + ) self._client.ready(self._coordinator_id) future.success(self._coordinator_id) elif error_type is 
Errors.GroupCoordinatorNotAvailableError: @@ -134,17 +142,21 @@ def _handle_group_coordinator_response(self, future, response): future.failure(error) else: error = error_type() - log.error("Unrecognized failure in Group Coordinator Request: %s", - error) + log.error("Unrecognized failure in Group Coordinator Request: %s", error) future.failure(error) def _failed_request(self, node_id, request, future, error): - log.error('Error sending %s to node %s [%s]', - request.__class__.__name__, node_id, error) + log.error( + "Error sending %s to node %s [%s]", + request.__class__.__name__, + node_id, + error, + ) # Marking coordinator dead # unless the error is caused by internal client pipelining - if not isinstance(error, (Errors.NodeNotReadyError, - Errors.TooManyInFlightRequests)): + if not isinstance( + error, (Errors.NodeNotReadyError, Errors.TooManyInFlightRequests) + ): self._coordinator_dead() future.failure(error) @@ -169,8 +181,7 @@ def offsets(self, partitions, timestamp): self._client.poll(future=future) if future.succeeded(): - for tp, offset in future.value: - offsets[tp] = offset + offsets.update({tp: offset} for tp, offset in future.value) continue if not future.retriable(): @@ -199,12 +210,18 @@ def _send_offset_request(self, partitions, timestamp): for partition in partitions: node_id = self._client.cluster.leader_for_partition(partition) if node_id is None: - log.debug("Partition %s is unknown for fetching offset," - " wait for metadata refresh", partition) + log.debug( + "Partition %s is unknown for fetching offset," + " wait for metadata refresh", + partition, + ) return Future().failure(Errors.StaleMetadata(partition)) - elif node_id == -1: - log.debug("Leader for partition %s unavailable for fetching offset," - " wait for metadata refresh", partition) + if node_id == -1: + log.debug( + "Leader for partition %s unavailable for fetching offset," + " wait for metadata refresh", + partition, + ) return Future().failure(Errors.LeaderNotAvailableError(partition)) nodes_per_partitions.setdefault(node_id, []).append(partition) @@ -212,14 +229,25 @@ def _send_offset_request(self, partitions, timestamp): # so create a separate future and attach a callback to update it # based on response error codes futures = [] - for node_id, partitions in six.iteritems(nodes_per_partitions): + for node_id, partitions_ in nodes_per_partitions.items(): request = OffsetRequest[0]( - -1, [(topic, [(partition.partition, timestamp, 1) for partition in partitions])] + -1, + [ + ( + topic, + [ + (partition.partition, timestamp, 1) + for partition in partitions_ + ], + ) + ], ) future_request = Future() _f = self._client.send(node_id, request) - _f.add_callback(self._handle_offset_response, partitions, future_request) - _f.add_errback(lambda e: future_request.failure(e)) + _f.add_callback(self._handle_offset_response, partitions_, future_request) + _f.add_errback( + lambda e, future_request=future_request: future_request.failure(e) + ) futures.append(future_request) return futures @@ -236,27 +264,36 @@ def _handle_offset_response(self, partitions, future, response): """ topic, partition_info = response.topics[0] assert len(response.topics) == 1, ( - 'OffsetResponse should only be for a single topic') - partition_ids = set([part.partition for part in partitions]) + "OffsetResponse should only be for a single topic" + ) + partition_ids = {part.partition for part in partitions} result = [] for pi in partition_info: part, error_code, offsets = pi assert topic == partitions[0].topic and part in partition_ids, ( - 
'OffsetResponse partition does not match OffsetRequest partition') + "OffsetResponse partition does not match OffsetRequest partition" + ) error_type = Errors.for_code(error_code) if error_type is Errors.NoError: - assert len(offsets) == 1, 'Expected OffsetResponse with one offset' + assert len(offsets) == 1, "Expected OffsetResponse with one offset" log.debug("Fetched offset %s for partition %d", offsets[0], part) result.append((TopicPartition(topic, part), offsets[0])) - elif error_type in (Errors.NotLeaderForPartitionError, - Errors.UnknownTopicOrPartitionError): - log.debug("Attempt to fetch offsets for partition %s failed due" - " to obsolete leadership information, retrying.", - str(partitions)) + elif error_type in ( + Errors.NotLeaderForPartitionError, + Errors.UnknownTopicOrPartitionError, + ): + log.debug( + "Attempt to fetch offsets for partition %s failed due" + " to obsolete leadership information, retrying.", + str(partitions), + ) future.failure(error_type(partitions)) else: - log.warning("Attempt to fetch offsets for partition %s failed due to:" - " %s", partitions, error_type) + log.warning( + "Attempt to fetch offsets for partition %s failed due to: %s", + partitions, + error_type, + ) future.failure(error_type(partitions)) future.success(result) @@ -285,7 +322,7 @@ def fetch_committed_offsets(self, partitions): if not future.retriable(): raise future.exception # pylint: disable-msg=raising-bad-type - time.sleep(self.config['retry_backoff_ms'] / 1000.0) + time.sleep(self.config["retry_backoff_ms"] / 1000.0) def _send_offset_fetch_request(self, partitions): """Fetch the committed offsets for a set of partitions. @@ -299,38 +336,38 @@ def _send_offset_fetch_request(self, partitions): Returns: Future: resolves to dict of offsets: {TopicPartition: int} """ - assert self.config['api_version'] >= (0, 8, 1), 'Unsupported Broker API' - assert all(map(lambda k: isinstance(k, TopicPartition), partitions)) + assert self.config["api_version"] >= (0, 8, 1), "Unsupported Broker API" + assert all(isinstance(k, TopicPartition) for k in partitions) if not partitions: return Future().success({}) - elif self._coordinator_unknown(): + if self._coordinator_unknown(): return Future().failure(Errors.GroupCoordinatorNotAvailableError) node_id = self._coordinator_id # Verify node is ready if not self._client.ready(node_id): - log.debug("Node %s not ready -- failing offset fetch request", - node_id) + log.debug("Node %s not ready -- failing offset fetch request", node_id) return Future().failure(Errors.NodeNotReadyError) - log.debug("Group %s fetching committed offsets for partitions: %s", - self.group_id, partitions) + log.debug( + "Group %s fetching committed offsets for partitions: %s", + self.group_id, + partitions, + ) # construct the request topic_partitions = collections.defaultdict(set) for tp in partitions: topic_partitions[tp.topic].add(tp.partition) - if self.config['api_version'] >= (0, 8, 2): + if self.config["api_version"] >= (0, 8, 2): request = OffsetFetchRequest[1]( - self.group_id, - list(topic_partitions.items()) + self.group_id, list(topic_partitions.items()) ) else: request = OffsetFetchRequest[0]( - self.group_id, - list(topic_partitions.items()) + self.group_id, list(topic_partitions.items()) ) # send the request with a callback @@ -348,8 +385,12 @@ def _handle_offset_fetch_response(self, future, response): error_type = Errors.for_code(error_code) if error_type is not Errors.NoError: error = error_type() - log.debug("Group %s failed to fetch offset for partition" - " %s: 
%s", self.group_id, tp, error) + log.debug( + "Group %s failed to fetch offset for partition %s: %s", + self.group_id, + tp, + error, + ) if error_type is Errors.GroupLoadInProgressError: # just retry future.failure(error) @@ -357,26 +398,34 @@ def _handle_offset_fetch_response(self, future, response): # re-discover the coordinator and retry self._coordinator_dead() future.failure(error) - elif error_type in (Errors.UnknownMemberIdError, - Errors.IllegalGenerationError): + elif error_type in ( + Errors.UnknownMemberIdError, + Errors.IllegalGenerationError, + ): future.failure(error) elif error_type is Errors.UnknownTopicOrPartitionError: - log.warning("OffsetFetchRequest -- unknown topic %s" - " (have you committed any offsets yet?)", - topic) + log.warning( + "OffsetFetchRequest -- unknown topic %s" + " (have you committed any offsets yet?)", + topic, + ) continue else: - log.error("Unknown error fetching offsets for %s: %s", - tp, error) + log.error( + "Unknown error fetching offsets for %s: %s", tp, error + ) future.failure(error) return - elif offset >= 0: + if offset >= 0: # record the position with the offset # (-1 indicates no committed offset to fetch) offsets[tp] = OffsetAndMetadata(offset, metadata) else: - log.debug("Group %s has no committed offset for partition" - " %s", self.group_id, tp) + log.debug( + "Group %s has no committed offset for partition %s", + self.group_id, + tp, + ) future.success(offsets) def get(self): @@ -386,15 +435,29 @@ def get(self): log.info("No partitions available, performing metadata update.") self._client.poll(future=future) return {} - partitions = [TopicPartition(self.topic, partition_id) for partition_id in topic_partitions] + partitions = [ + TopicPartition(self.topic, partition_id) + for partition_id in topic_partitions + ] offsets = self.offsets(partitions, -1) committed = self.fetch_committed_offsets(partitions) lags = {} - for tp, offset in six.iteritems(offsets): - commit_offset = committed[tp] if tp in committed else 0 - numerical = commit_offset if isinstance(commit_offset, int) else commit_offset.offset + for tp, offset in offsets.items(): + commit_offset = committed.get(tp, 0) + numerical = ( + commit_offset + if isinstance(commit_offset, int) + else commit_offset.offset + ) lag = offset - numerical pid = tp.partition if isinstance(tp, TopicPartition) else tp - log.debug("Lag for %s (%s): %s, %s, %s", self.topic, pid, offset, commit_offset, lag) + log.debug( + "Lag for %s (%s): %s, %s, %s", + self.topic, + pid, + offset, + commit_offset, + lag, + ) lags[pid] = lag - return lags \ No newline at end of file + return lags diff --git a/frontera/contrib/messagebus/kafkabus.py b/frontera/contrib/messagebus/kafkabus.py index 490262891..13348aac9 100644 --- a/frontera/contrib/messagebus/kafkabus.py +++ b/frontera/contrib/messagebus/kafkabus.py @@ -1,19 +1,24 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import - +import contextlib from logging import getLogger from time import sleep +from traceback import format_tb -import six from kafka import KafkaConsumer, KafkaProducer, TopicPartition - -from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner -from frontera.contrib.messagebus.kafka.async import OffsetsFetcherAsync -from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseSpiderFeedStream, \ - BaseStreamConsumer, BaseScoringLogStream, BaseStreamProducer from twisted.internet.task import LoopingCall -from traceback import format_tb +from 
frontera.contrib.backends.partitioners import ( + Crc32NamePartitioner, + FingerprintPartitioner, +) +from frontera.contrib.messagebus.kafka.async_ import OffsetsFetcherAsync +from frontera.core.messagebus import ( + BaseMessageBus, + BaseScoringLogStream, + BaseSpiderFeedStream, + BaseSpiderLogStream, + BaseStreamConsumer, + BaseStreamProducer, +) logger = getLogger("messagebus.kafka") @@ -22,6 +27,7 @@ class Consumer(BaseStreamConsumer): """ Used in DB and SW worker. SW consumes per partition. """ + def __init__(self, location, topic, group, partition_id): self._location = location self._group = group @@ -31,7 +37,9 @@ def __init__(self, location, topic, group, partition_id): group_id=self._group, max_partition_fetch_bytes=10485760, consumer_timeout_ms=100, - client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"), + client_id="{}-{}".format( + self._topic, str(partition_id) if partition_id is not None else "all" + ), request_timeout_ms=120 * 1000, ) @@ -39,7 +47,10 @@ def __init__(self, location, topic, group, partition_id): self._partition_ids = [TopicPartition(self._topic, partition_id)] self._consumer.assign(self._partition_ids) else: - self._partition_ids = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)] + self._partition_ids = [ + TopicPartition(self._topic, pid) + for pid in self._consumer.partitions_for_topic(self._topic) + ] self._consumer.subscribe(topics=[self._topic]) if self._consumer._use_consumer_group(): self._consumer._coordinator.ensure_coordinator_known() @@ -52,7 +63,7 @@ def _start_looping_call(self, interval=60): def errback(failure): logger.exception(failure.value) if failure.frames: - logger.critical(str("").join(format_tb(failure.getTracebackObject()))) + logger.critical("".join(format_tb(failure.getTracebackObject()))) self._poll_task.start(interval).addErrback(errback) self._poll_task = LoopingCall(self._poll_client) @@ -63,20 +74,18 @@ def _poll_client(self): def get_messages(self, timeout=0.1, count=1): result = [] - while count > 0: - try: + with contextlib.suppress(StopIteration): + while count > 0: m = next(self._consumer) result.append(m.value) count -= 1 - except StopIteration: - break return result def get_offset(self, partition_id): for tp in self._partition_ids: if tp.partition == partition_id: return self._consumer.position(tp) - raise KeyError("Can't find partition %d", partition_id) + raise KeyError(f"Can't find partition {partition_id}") def close(self): self._poll_task.stop() @@ -98,8 +107,11 @@ def __init__(self, location, topic, compression): self._create() def _create(self): - self._producer = KafkaProducer(bootstrap_servers=self._location, retries=5, - compression_type=self._compression) + self._producer = KafkaProducer( + bootstrap_servers=self._location, + retries=5, + compression_type=self._compression, + ) def send(self, key, *messages): for msg in messages: @@ -118,8 +130,12 @@ def __init__(self, location, topic_done, partitioner, compression): self._topic_done = topic_done self._partitioner = partitioner self._compression = compression - self._producer = KafkaProducer(bootstrap_servers=self._location, partitioner=partitioner, retries=5, - compression_type=self._compression) + self._producer = KafkaProducer( + bootstrap_servers=self._location, + partitioner=partitioner, + retries=5, + compression_type=self._compression, + ) def send(self, key, *messages): for msg in messages: @@ -142,8 +158,12 @@ def __init__(self, messagebus): self._partitions = 
messagebus.spider_log_partitions def producer(self): - return KeyedProducer(self._location, self._topic, FingerprintPartitioner(self._partitions), - self._codec) + return KeyedProducer( + self._location, + self._topic, + FingerprintPartitioner(self._partitions), + self._codec, + ) def consumer(self, partition_id, type): """ @@ -152,7 +172,7 @@ def consumer(self, partition_id, type): :param type: either 'db' or 'sw' :return: """ - group = self._sw_group if type == b'sw' else self._db_group + group = self._sw_group if type == b"sw" else self._db_group c = Consumer(self._location, self._topic, group, partition_id) assert len(c._consumer.partitions_for_topic(self._topic)) == self._partitions return c @@ -165,8 +185,11 @@ def __init__(self, messagebus): self._topic = messagebus.topic_todo self._max_next_requests = messagebus.max_next_requests self._hostname_partitioning = messagebus.hostname_partitioning - self._offset_fetcher = OffsetsFetcherAsync(bootstrap_servers=self._location, topic=self._topic, - group_id=self._general_group) + self._offset_fetcher = OffsetsFetcherAsync( + bootstrap_servers=self._location, + topic=self._topic, + group_id=self._general_group, + ) self._codec = messagebus.codec self._partitions = messagebus.spider_feed_partitions @@ -178,14 +201,17 @@ def consumer(self, partition_id): def available_partitions(self): partitions = [] lags = self._offset_fetcher.get() - for partition, lag in six.iteritems(lags): + for partition, lag in lags.items(): if lag < self._max_next_requests: partitions.append(partition) return partitions def producer(self): - partitioner = Crc32NamePartitioner(self._partitions) if self._hostname_partitioning \ + partitioner = ( + Crc32NamePartitioner(self._partitions) + if self._hostname_partitioning else FingerprintPartitioner(self._partitions) + ) return KeyedProducer(self._location, self._topic, partitioner, self._codec) @@ -205,21 +231,21 @@ def producer(self): class MessageBus(BaseMessageBus): def __init__(self, settings): - self.topic_todo = settings.get('SPIDER_FEED_TOPIC') - self.topic_done = settings.get('SPIDER_LOG_TOPIC') - self.topic_scoring = settings.get('SCORING_LOG_TOPIC') - - self.spiderlog_dbw_group = settings.get('SPIDER_LOG_DBW_GROUP') - self.spiderlog_sw_group = settings.get('SPIDER_LOG_SW_GROUP') - self.scoringlog_dbw_group = settings.get('SCORING_LOG_DBW_GROUP') - self.spider_feed_group = settings.get('SPIDER_FEED_GROUP') - self.spider_partition_id = settings.get('SPIDER_PARTITION_ID') + self.topic_todo = settings.get("SPIDER_FEED_TOPIC") + self.topic_done = settings.get("SPIDER_LOG_TOPIC") + self.topic_scoring = settings.get("SCORING_LOG_TOPIC") + + self.spiderlog_dbw_group = settings.get("SPIDER_LOG_DBW_GROUP") + self.spiderlog_sw_group = settings.get("SPIDER_LOG_SW_GROUP") + self.scoringlog_dbw_group = settings.get("SCORING_LOG_DBW_GROUP") + self.spider_feed_group = settings.get("SPIDER_FEED_GROUP") + self.spider_partition_id = settings.get("SPIDER_PARTITION_ID") self.max_next_requests = settings.MAX_NEXT_REQUESTS - self.hostname_partitioning = settings.get('QUEUE_HOSTNAME_PARTITIONING') - self.codec = settings.get('KAFKA_CODEC') - self.kafka_location = settings.get('KAFKA_LOCATION') - self.spider_log_partitions = settings.get('SPIDER_LOG_PARTITIONS') - self.spider_feed_partitions = settings.get('SPIDER_FEED_PARTITIONS') + self.hostname_partitioning = settings.get("QUEUE_HOSTNAME_PARTITIONING") + self.codec = settings.get("KAFKA_CODEC") + self.kafka_location = settings.get("KAFKA_LOCATION") + self.spider_log_partitions = 
settings.get("SPIDER_LOG_PARTITIONS") + self.spider_feed_partitions = settings.get("SPIDER_FEED_PARTITIONS") def spider_log(self): return SpiderLogStream(self) diff --git a/frontera/contrib/messagebus/zeromq/__init__.py b/frontera/contrib/messagebus/zeromq/__init__.py index ab1a56155..37106bcfe 100644 --- a/frontera/contrib/messagebus/zeromq/__init__.py +++ b/frontera/contrib/messagebus/zeromq/__init__.py @@ -1,34 +1,46 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from time import time, sleep -from struct import pack, unpack from logging import getLogger +from struct import pack, unpack +from time import sleep, time import zmq -import six -from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \ - BaseSpiderFeedStream, BaseScoringLogStream -from frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner +from frontera.contrib.backends.partitioners import ( + Crc32NamePartitioner, + FingerprintPartitioner, +) from frontera.contrib.messagebus.zeromq.socket_config import SocketConfig -from six.moves import range +from frontera.core.messagebus import ( + BaseMessageBus, + BaseScoringLogStream, + BaseSpiderFeedStream, + BaseSpiderLogStream, + BaseStreamConsumer, +) class Consumer(BaseStreamConsumer): - def __init__(self, context, location, partition_id, identity, seq_warnings=False, hwm=1000): + def __init__( + self, context, location, partition_id, identity, seq_warnings=False, hwm=1000 + ): self.subscriber = context.zeromq.socket(zmq.SUB) self.subscriber.connect(location) self.subscriber.set(zmq.RCVHWM, hwm) - filter = identity + pack('>B', partition_id) if partition_id is not None else identity + filter = ( + identity + pack(">B", partition_id) + if partition_id is not None + else identity + ) self.subscriber.setsockopt(zmq.SUBSCRIBE, filter) self.counter = 0 self.count_global = partition_id is None - self.logger = getLogger("distributed_frontera.messagebus.zeromq.Consumer(%s-%s)" % (identity, partition_id)) + self.logger = getLogger( + f"distributed_frontera.messagebus.zeromq.Consumer({identity}-{partition_id})" + ) self.seq_warnings = seq_warnings self.stats = context.stats - self.stat_key = "consumer-%s" % identity + self.stat_key = f"consumer-{identity}" self.stats[self.stat_key] = 0 def get_messages(self, timeout=0.1, count=1): @@ -37,7 +49,7 @@ def get_messages(self, timeout=0.1, count=1): while count: try: msg = self.subscriber.recv_multipart(copy=True, flags=zmq.NOBLOCK) - except zmq.Again: + except zmq.Again: # noqa: PERF203 if time() - started > timeout: break sleep(sleep_time) @@ -48,8 +60,11 @@ def get_messages(self, timeout=0.1, count=1): self.counter = seqno elif self.counter != seqno: if self.seq_warnings: - self.logger.warning("Sequence counter mismatch: expected %d, got %d. Check if system " - "isn't missing messages." % (self.counter, seqno)) + self.logger.warning( + f"Sequence counter mismatch: expected " + f"{self.counter}, got {seqno}. Check if system " + f"isn't missing messages." 
+ ) self.counter = None yield msg[1] count -= 1 @@ -61,7 +76,7 @@ def get_offset(self, partition_id): return self.counter -class Producer(object): +class Producer: def __init__(self, context, location, identity): self.identity = identity self.sender = context.zeromq.socket(zmq.PUB) @@ -69,7 +84,7 @@ def __init__(self, context, location, identity): self.counters = {} self.global_counter = 0 self.stats = context.stats - self.stat_key = "producer-%s" % identity + self.stat_key = f"producer-{identity}" self.stats[self.stat_key] = 0 def send(self, key, *messages): @@ -78,13 +93,18 @@ def send(self, key, *messages): raise TypeError("msg is not a list or tuple!") # Raise TypeError if any message is not encoded as bytes - if any(not isinstance(m, six.binary_type) for m in messages): + if any(not isinstance(m, bytes) for m in messages): raise TypeError("all produce message payloads must be type bytes") partition = self.partitioner.partition(key) counter = self.counters.get(partition, 0) for msg in messages: - self.sender.send_multipart([self.identity + pack(">B", partition), msg, - pack(">II", counter, self.global_counter)]) + self.sender.send_multipart( + [ + self.identity + pack(">B", partition), + msg, + pack(">II", counter, self.global_counter), + ] + ) counter += 1 self.global_counter += 1 if counter == 4294967296: @@ -103,7 +123,7 @@ def get_offset(self, partition_id): class SpiderLogProducer(Producer): def __init__(self, context, location, partitions): - super(SpiderLogProducer, self).__init__(context, location, b'sl') + super().__init__(context, location, b"sl") self.partitioner = FingerprintPartitioner(partitions) @@ -119,13 +139,13 @@ def producer(self): return SpiderLogProducer(self.context, self.out_location, self.partitions) def consumer(self, partition_id, type): - location = self.sw_in_location if type == b'sw' else self.db_in_location - return Consumer(self.context, location, partition_id, b'sl') + location = self.sw_in_location if type == b"sw" else self.db_in_location + return Consumer(self.context, location, partition_id, b"sl") class UpdateScoreProducer(Producer): def __init__(self, context, location): - super(UpdateScoreProducer, self).__init__(context, location, b'us') + super().__init__(context, location, b"us") def send(self, key, *messages): # Guarantee that msg is actually a list or tuple (should always be true) @@ -133,11 +153,13 @@ def send(self, key, *messages): raise TypeError("msg is not a list or tuple!") # Raise TypeError if any message is not encoded as bytes - if any(not isinstance(m, six.binary_type) for m in messages): + if any(not isinstance(m, bytes) for m in messages): raise TypeError("all produce message payloads must be type bytes") counter = self.counters.get(0, 0) for msg in messages: - self.sender.send_multipart([self.identity, msg, pack(">II", counter, counter)]) + self.sender.send_multipart( + [self.identity, msg, pack(">II", counter, counter)] + ) counter += 1 if counter == 4294967296: counter = 0 @@ -152,7 +174,7 @@ def __init__(self, messagebus): self.out_location = messagebus.socket_config.db_in() def consumer(self): - return Consumer(self.context, self.out_location, None, b'us') + return Consumer(self.context, self.out_location, None, b"us") def producer(self): return UpdateScoreProducer(self.context, self.in_location) @@ -160,9 +182,12 @@ def producer(self): class SpiderFeedProducer(Producer): def __init__(self, context, location, partitions, hwm, hostname_partitioning): - super(SpiderFeedProducer, self).__init__(context, location, b'sf') - 
self.partitioner = Crc32NamePartitioner(partitions) if hostname_partitioning else \ - FingerprintPartitioner(partitions) + super().__init__(context, location, b"sf") + self.partitioner = ( + Crc32NamePartitioner(partitions) + if hostname_partitioning + else FingerprintPartitioner(partitions) + ) self.sender.set(zmq.SNDHWM, hwm) @@ -178,11 +203,23 @@ def __init__(self, messagebus): self.hostname_partitioning = messagebus.hostname_partitioning def consumer(self, partition_id): - return Consumer(self.context, self.out_location, partition_id, b'sf', seq_warnings=True, hwm=self.consumer_hwm) + return Consumer( + self.context, + self.out_location, + partition_id, + b"sf", + seq_warnings=True, + hwm=self.consumer_hwm, + ) def producer(self): - return SpiderFeedProducer(self.context, self.in_location, self.partitions, - self.producer_hwm, self.hostname_partitioning) + return SpiderFeedProducer( + self.context, + self.in_location, + self.partitions, + self.producer_hwm, + self.hostname_partitioning, + ) def available_partitions(self): return self.ready_partitions @@ -194,8 +231,7 @@ def mark_busy(self, partition_id): self.ready_partitions.discard(partition_id) -class Context(object): - +class Context: zeromq = zmq.Context() stats = {} @@ -203,13 +239,18 @@ class Context(object): class MessageBus(BaseMessageBus): def __init__(self, settings): self.context = Context() - self.socket_config = SocketConfig(settings.get('ZMQ_ADDRESS'), - settings.get('ZMQ_BASE_PORT')) - self.spider_log_partitions = [i for i in range(settings.get('SPIDER_LOG_PARTITIONS'))] - self.spider_feed_partitions = [i for i in range(settings.get('SPIDER_FEED_PARTITIONS'))] - self.spider_feed_sndhwm = int(settings.get('MAX_NEXT_REQUESTS') * len(self.spider_feed_partitions) * 1.2) - self.spider_feed_rcvhwm = int(settings.get('MAX_NEXT_REQUESTS') * 2.0) - self.hostname_partitioning = settings.get('QUEUE_HOSTNAME_PARTITIONING') + self.socket_config = SocketConfig( + settings.get("ZMQ_ADDRESS"), settings.get("ZMQ_BASE_PORT") + ) + self.spider_log_partitions = list(range(settings.get("SPIDER_LOG_PARTITIONS"))) + self.spider_feed_partitions = list( + range(settings.get("SPIDER_FEED_PARTITIONS")) + ) + self.spider_feed_sndhwm = int( + settings.get("MAX_NEXT_REQUESTS") * len(self.spider_feed_partitions) * 1.2 + ) + self.spider_feed_rcvhwm = int(settings.get("MAX_NEXT_REQUESTS") * 2.0) + self.hostname_partitioning = settings.get("QUEUE_HOSTNAME_PARTITIONING") if self.socket_config.is_ipv6: self.context.zeromq.setsockopt(zmq.IPV6, True) diff --git a/frontera/contrib/messagebus/zeromq/broker.py b/frontera/contrib/messagebus/zeromq/broker.py index 9516d8416..ef8bad513 100644 --- a/frontera/contrib/messagebus/zeromq/broker.py +++ b/frontera/contrib/messagebus/zeromq/broker.py @@ -1,22 +1,20 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import -from time import time -from datetime import timedelta +import contextlib import logging from argparse import ArgumentParser +from datetime import timedelta from struct import unpack +from time import time import zmq from zmq.eventloop.ioloop import IOLoop from zmq.eventloop.zmqstream import ZMQStream from frontera.settings import Settings -from .socket_config import SocketConfig +from .socket_config import SocketConfig -class Server(object): +class Server: ctx = None loop = None stats = None @@ -31,13 +29,13 @@ def __init__(self, address, base_port): self.ctx = zmq.Context() self.loop = IOLoop.instance() self.stats = { - 'started': time(), - 'spiders_out_recvd': 0, - 'spiders_in_recvd': 0, 
- 'db_in_recvd': 0, - 'db_out_recvd': 0, - 'sw_in_recvd': 0, - 'sw_out_recvd': 0 + "started": time(), + "spiders_out_recvd": 0, + "spiders_in_recvd": 0, + "db_in_recvd": 0, + "db_out_recvd": 0, + "sw_in_recvd": 0, + "sw_out_recvd": 0, } socket_config = SocketConfig(address, base_port) @@ -73,20 +71,23 @@ def __init__(self, address, base_port): self.sw_in.on_recv(self.handle_sw_in_recv) self.db_in.on_recv(self.handle_db_in_recv) self.spiders_in.on_recv(self.handle_spiders_in_recv) - logging.basicConfig(format="%(asctime)s %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) - self.logger = logging.getLogger("distributed_frontera.messagebus" - ".zeromq.broker.Server") - self.logger.info("Using socket: {}:{}".format(socket_config.ip_addr, - socket_config.base_port)) + logging.basicConfig( + format="%(asctime)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + ) + self.logger = logging.getLogger( + "distributed_frontera.messagebus.zeromq.broker.Server" + ) + self.logger.info( + f"Using socket: {socket_config.ip_addr}:{socket_config.base_port}" + ) def start(self): self.logger.info("Distributed Frontera ZeroMQ broker is started.") self.log_stats() - try: + with contextlib.suppress(KeyboardInterrupt): self.loop.start() - except KeyboardInterrupt: - pass def log_stats(self): self.logger.info(self.stats) @@ -95,37 +96,37 @@ def log_stats(self): def handle_spiders_out_recv(self, msg): self.sw_in.send_multipart(msg) self.db_in.send_multipart(msg) - self.stats['spiders_out_recvd'] += 1 + self.stats["spiders_out_recvd"] += 1 def handle_sw_out_recv(self, msg): self.db_in.send_multipart(msg) - self.stats['sw_out_recvd'] += 1 + self.stats["sw_out_recvd"] += 1 def handle_db_out_recv(self, msg): self.spiders_in.send_multipart(msg) - self.stats['db_out_recvd'] += 1 + self.stats["db_out_recvd"] += 1 def handle_db_in_recv(self, msg): - self.stats['db_in_recvd'] += 1 - if b'\x01' in msg[0] or b'\x00' in msg[0]: + self.stats["db_in_recvd"] += 1 + if b"\x01" in msg[0] or b"\x00" in msg[0]: action, identity, partition_id = self.decode_subscription(msg[0]) - if identity == b'sl': + if identity == b"sl": self.spiders_out.send_multipart(msg) return - if identity == b'us': + if identity == b"us": self.sw_out.send_multipart(msg) return - raise AttributeError('Unknown identity in channel subscription.') + raise AttributeError("Unknown identity in channel subscription.") def handle_sw_in_recv(self, msg): - if b'\x01' in msg[0] or b'\x00' in msg[0]: + if b"\x01" in msg[0] or b"\x00" in msg[0]: self.spiders_out.send_multipart(msg) - self.stats['sw_in_recvd'] += 1 + self.stats["sw_in_recvd"] += 1 def handle_spiders_in_recv(self, msg): - if b'\x01' in msg[0] or b'\x00' in msg[0]: + if b"\x01" in msg[0] or b"\x00" in msg[0]: self.db_out.send_multipart(msg) - self.stats['spiders_in_recvd'] += 1 + self.stats["spiders_in_recvd"] += 1 def decode_subscription(self, msg): """ @@ -139,7 +140,7 @@ def decode_subscription(self, msg): """ if len(msg) == 4: return unpack(">B2sB", msg) - elif len(msg) == 3: + if len(msg) == 3: action, identity = unpack(">B2s", msg) return action, identity, None raise ValueError("Can't decode subscription correctly.") @@ -151,20 +152,29 @@ def main(): """ parser = ArgumentParser(description="Crawl frontier worker.") parser.add_argument( - '--config', type=str, - help='Settings module name, should be accessible by import.') + "--config", + type=str, + help="Settings module name, should be accessible by import.", + ) parser.add_argument( - '--address', type=str, - 
help='Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1' - '. When binding to wildcard it defaults to IPv4.') + "--address", + type=str, + help="Hostname, IP address or Wildcard * to bind. Default is 127.0.0.1" + ". When binding to wildcard it defaults to IPv4.", + ) parser.add_argument( - '--log-level', '-L', type=str, default='INFO', - help='Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is' - ' INFO.') + "--log-level", + "-L", + type=str, + default="INFO", + help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL. Default is INFO.", + ) parser.add_argument( - '--port', type=int, - help='Base port number, server will bind to 6 ports starting from base' - '. Default is 5550') + "--port", + type=int, + help="Base port number, server will bind to 6 ports starting from base" + ". Default is 5550", + ) args = parser.parse_args() settings = Settings(module=args.config) @@ -175,5 +185,5 @@ def main(): server.start() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/frontera/contrib/messagebus/zeromq/socket_config.py b/frontera/contrib/messagebus/zeromq/socket_config.py index 6ecddf842..fe9e5d632 100644 --- a/frontera/contrib/messagebus/zeromq/socket_config.py +++ b/frontera/contrib/messagebus/zeromq/socket_config.py @@ -1,63 +1,63 @@ -# -*- coding: utf-8 -*- """ Contains the SocketConfig class """ -from __future__ import absolute_import -from socket import getaddrinfo, gaierror +from socket import gaierror, getaddrinfo -class SocketConfig(object): + +class SocketConfig: """ Converts address to IPv4 or IPv6 or * and returns the necessary socket addresses. NOTE: When using * it defaults to IPv4 """ + def __init__(self, address, base_port): - if address == '*': - self.ip_addr = '*' + if address == "*": + self.ip_addr = "*" self.base_port = base_port self.is_ipv6 = False else: try: addr_tuple = getaddrinfo(address, base_port)[0][4] - except gaierror: - raise gaierror("Hostname '%s' could not be resolved" % address) + except gaierror as e: + raise gaierror(f"Hostname '{address}' could not be resolved") from e self.ip_addr = addr_tuple[0] self.base_port = addr_tuple[1] - self.is_ipv6 = True if len(addr_tuple) == 4 else False + self.is_ipv6 = len(addr_tuple) == 4 def spiders_in(self): """ TCP socket for incoming spider messages """ - return 'tcp://%s:%d' % (self.ip_addr, self.base_port) + return f"tcp://{self.ip_addr}:{self.base_port}" def spiders_out(self): """ TCP socket for outgoing spider messages """ - return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 1) + return f"tcp://{self.ip_addr}:{self.base_port + 1}" def sw_in(self): """ TCP socket for incoming SW messages """ - return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 2) + return f"tcp://{self.ip_addr}:{self.base_port + 2}" def sw_out(self): """ TCP socket for outgoing SW messages """ - return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 3) + return f"tcp://{self.ip_addr}:{self.base_port + 3}" def db_in(self): """ TCP socket for incoming messages """ - return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 4) + return f"tcp://{self.ip_addr}:{self.base_port + 4}" def db_out(self): """ TCP socket for outgoing DW messages """ - return 'tcp://%s:%d' % (self.ip_addr, self.base_port + 5) + return f"tcp://{self.ip_addr}:{self.base_port + 5}" diff --git a/frontera/contrib/middlewares/domain.py b/frontera/contrib/middlewares/domain.py index c8a0f117a..8626669f9 100644 --- a/frontera/contrib/middlewares/domain.py +++ b/frontera/contrib/middlewares/domain.py @@ -1,10 +1,10 @@ -from 
__future__ import absolute_import import re -from frontera.core.components import Middleware -from frontera.utils.url import parse_domain_from_url_fast, parse_domain_from_url from w3lib.util import to_bytes +from frontera.core.components import Middleware +from frontera.utils.url import parse_domain_from_url, parse_domain_from_url_fast + # TODO: Why not to put the whole url_parse result here in meta? @@ -57,12 +57,15 @@ class DomainMiddleware(Middleware): .. _`RFC 1808`: http://tools.ietf.org/html/rfc1808.html """ - component_name = 'Domain Middleware' + + component_name = "Domain Middleware" def __init__(self, manager): self.manager = manager - use_tldextract = self.manager.settings.get('TLDEXTRACT_DOMAIN_INFO', False) - self.parse_domain_func = parse_domain_from_url if use_tldextract else parse_domain_from_url_fast + use_tldextract = self.manager.settings.get("TLDEXTRACT_DOMAIN_INFO", False) + self.parse_domain_func = ( + parse_domain_from_url if use_tldextract else parse_domain_from_url_fast + ) @classmethod def from_manager(cls, manager): @@ -91,24 +94,26 @@ def request_error(self, request, error): return self._add_domain(request) def _add_domain(self, obj): - obj.meta[b'domain'] = self.parse_domain_info(obj.url, self.manager.test_mode) - if b'redirect_urls' in obj.meta: - obj.meta[b'redirect_domains'] = [self.parse_domain_info(url, self.manager.test_mode) - for url in obj.meta[b'redirect_urls']] + obj.meta[b"domain"] = self.parse_domain_info(obj.url, self.manager.test_mode) + if b"redirect_urls" in obj.meta: + obj.meta[b"redirect_domains"] = [ + self.parse_domain_info(url, self.manager.test_mode) + for url in obj.meta[b"redirect_urls"] + ] return obj def parse_domain_info(self, url, test_mode=False): if test_mode: - match = re.match('([A-Z])\w+', url) - netloc = name = to_bytes(match.groups()[0]) if match else b'?' - scheme = sld = tld = subdomain = b'-' + match = re.match(r"([A-Z])\w+", url) + netloc = name = to_bytes(match.groups()[0]) if match else b"?" 
+ scheme = sld = tld = subdomain = b"-" else: netloc, name, scheme, sld, tld, subdomain = self.parse_domain_func(url) return { - b'netloc': to_bytes(netloc), - b'name': to_bytes(name), - b'scheme': to_bytes(scheme), - b'sld': to_bytes(sld), - b'tld': to_bytes(tld), - b'subdomain': to_bytes(subdomain), + b"netloc": to_bytes(netloc), + b"name": to_bytes(name), + b"scheme": to_bytes(scheme), + b"sld": to_bytes(sld), + b"tld": to_bytes(tld), + b"subdomain": to_bytes(subdomain), } diff --git a/frontera/contrib/middlewares/fingerprint.py b/frontera/contrib/middlewares/fingerprint.py index bb08fca7c..c51314375 100644 --- a/frontera/contrib/middlewares/fingerprint.py +++ b/frontera/contrib/middlewares/fingerprint.py @@ -1,16 +1,18 @@ -from __future__ import absolute_import +from w3lib.url import canonicalize_url + from frontera.core.components import Middleware from frontera.exceptions import NotConfigured -from w3lib.url import canonicalize_url from frontera.utils.misc import load_object class BaseFingerprintMiddleware(Middleware): - component_name = 'Base Fingerprint Middleware' - fingerprint_function_name = '' + component_name = "Base Fingerprint Middleware" + fingerprint_function_name = "" def __init__(self, manager): - fingerprint_function_name = manager.settings.get(self.fingerprint_function_name, None) + fingerprint_function_name = manager.settings.get( + self.fingerprint_function_name, None + ) if not fingerprint_function_name: raise NotConfigured self.fingerprint_function = load_object(fingerprint_function_name) @@ -66,16 +68,18 @@ class UrlFingerprintMiddleware(BaseFingerprintMiddleware): """ - component_name = 'URL Fingerprint Middleware' - fingerprint_function_name = 'URL_FINGERPRINT_FUNCTION' + component_name = "URL Fingerprint Middleware" + fingerprint_function_name = "URL_FINGERPRINT_FUNCTION" def _get_fingerprint(self, url): return self.fingerprint_function(canonicalize_url(url)) def _add_fingerprint(self, obj): - obj.meta[b'fingerprint'] = self._get_fingerprint(obj.url) - if b'redirect_urls' in obj.meta: - obj.meta[b'redirect_fingerprints'] = [self._get_fingerprint(url) for url in obj.meta[b'redirect_urls']] + obj.meta[b"fingerprint"] = self._get_fingerprint(obj.url) + if b"redirect_urls" in obj.meta: + obj.meta[b"redirect_fingerprints"] = [ + self._get_fingerprint(url) for url in obj.meta[b"redirect_urls"] + ] return obj @@ -108,13 +112,15 @@ class DomainFingerprintMiddleware(BaseFingerprintMiddleware): """ - component_name = 'Domain Fingerprint Middleware' - fingerprint_function_name = 'DOMAIN_FINGERPRINT_FUNCTION' + component_name = "Domain Fingerprint Middleware" + fingerprint_function_name = "DOMAIN_FINGERPRINT_FUNCTION" def _add_fingerprint(self, obj): - if b'domain' in obj.meta and b'name' in obj.meta[b'domain']: - obj.meta[b'domain'][b'fingerprint'] = self.fingerprint_function(obj.meta[b'domain'][b'name']) - if b'redirect_domains' in obj.meta: - for domain in obj.meta[b'redirect_domains']: - domain[b'fingerprint'] = self.fingerprint_function(domain[b'name']) + if b"domain" in obj.meta and b"name" in obj.meta[b"domain"]: + obj.meta[b"domain"][b"fingerprint"] = self.fingerprint_function( + obj.meta[b"domain"][b"name"] + ) + if b"redirect_domains" in obj.meta: + for domain in obj.meta[b"redirect_domains"]: + domain[b"fingerprint"] = self.fingerprint_function(domain[b"name"]) return obj diff --git a/frontera/contrib/requests/converters.py b/frontera/contrib/requests/converters.py index 8e1059376..eb891e129 100644 --- a/frontera/contrib/requests/converters.py +++ 
b/frontera/contrib/requests/converters.py @@ -1,6 +1,4 @@ -from __future__ import absolute_import from requests.models import Request as RequestsRequest -from requests.models import Response as RequestsResponse from frontera.core.models import Request as FrontierRequest from frontera.core.models import Response as FrontierResponse @@ -9,33 +7,41 @@ class RequestConverter(BaseRequestConverter): """Converts between frontera and Requests request objects""" + def to_frontier(self, request): """request: Requests > Frontier""" - return FrontierRequest(url=request.url, - method=request.method, - headers=request.headers, - cookies=request.cookies if hasattr(request, 'cookies') else {}) + return FrontierRequest( + url=request.url, + method=request.method, + headers=request.headers, + cookies=request.cookies if hasattr(request, "cookies") else {}, + ) def from_frontier(self, request): """request: Frontier > Scrapy""" - return RequestsRequest(url=request.url, - method=request.method, - headers=request.headers, - cookies=request.cookies) + return RequestsRequest( + url=request.url, + method=request.method, + headers=request.headers, + cookies=request.cookies, + ) class ResponseConverter(BaseResponseConverter): """Converts between frontera and Scrapy response objects""" + def __init__(self, request_converter): self._request_converter = request_converter def to_frontier(self, response): """response: Scrapy > Frontier""" - return FrontierResponse(url=response.url, - status_code=response.status_code, - headers=response.headers, - body=response.text, - request=self._request_converter.to_frontier(response.request)) + return FrontierResponse( + url=response.url, + status_code=response.status_code, + headers=response.headers, + body=response.text, + request=self._request_converter.to_frontier(response.request), + ) def from_frontier(self, response): """response: Frontier > Scrapy""" diff --git a/frontera/contrib/requests/manager.py b/frontera/contrib/requests/manager.py index fa48fa783..a023193ac 100644 --- a/frontera/contrib/requests/manager.py +++ b/frontera/contrib/requests/manager.py @@ -1,11 +1,10 @@ -from __future__ import absolute_import from frontera.utils.managers import FrontierManagerWrapper + from .converters import RequestConverter, ResponseConverter class RequestsFrontierManager(FrontierManagerWrapper): - def __init__(self, settings): - super(RequestsFrontierManager, self).__init__(settings) + super().__init__(settings) self.request_converter = RequestConverter() self.response_converter = ResponseConverter(self.request_converter) diff --git a/frontera/contrib/scrapy/converters.py b/frontera/contrib/scrapy/converters.py index fc013150d..6024adc23 100644 --- a/frontera/contrib/scrapy/converters.py +++ b/frontera/contrib/scrapy/converters.py @@ -1,15 +1,18 @@ -from __future__ import absolute_import +import functools +import operator + from scrapy.http.request import Request as ScrapyRequest from scrapy.http.response import Response as ScrapyResponse +from w3lib.util import to_bytes, to_unicode from frontera.core.models import Request as FrontierRequest from frontera.core.models import Response as FrontierResponse from frontera.utils.converters import BaseRequestConverter, BaseResponseConverter -from w3lib.util import to_bytes, to_native_str class RequestConverter(BaseRequestConverter): """Converts between frontera and Scrapy request objects""" + def __init__(self, spider): self.spider = spider @@ -18,7 +21,11 @@ def to_frontier(self, scrapy_request): if isinstance(scrapy_request.cookies, dict): 
cookies = scrapy_request.cookies else: - cookies = dict(sum([list(d.items()) for d in scrapy_request.cookies], [])) + cookies = dict( + functools.reduce( + operator.iadd, [list(d.items()) for d in scrapy_request.cookies], [] + ) + ) cb = scrapy_request.callback if callable(cb): cb = _find_method(self.spider, cb) @@ -28,87 +35,99 @@ def to_frontier(self, scrapy_request): scrapy_meta = scrapy_request.meta meta = {} - if b'frontier_request' in scrapy_meta: - request = scrapy_meta[b'frontier_request'] + if b"frontier_request" in scrapy_meta: + request = scrapy_meta[b"frontier_request"] if isinstance(request, FrontierRequest): meta = request.meta - del scrapy_meta[b'frontier_request'] - - meta.update({ - b'scrapy_callback': cb, - b'scrapy_errback': eb, - b'scrapy_meta': scrapy_meta, - b'origin_is_frontier': True, - }) - if 'redirect_urls' in scrapy_meta: - meta[b'redirect_urls'] = scrapy_meta['redirect_urls'] - return FrontierRequest(url=scrapy_request.url, - method=scrapy_request.method, - headers=scrapy_request.headers, - cookies=cookies, - meta=meta, - body=scrapy_request.body) + del scrapy_meta[b"frontier_request"] + + meta.update( + { + b"scrapy_callback": cb, + b"scrapy_errback": eb, + b"scrapy_meta": scrapy_meta, + b"origin_is_frontier": True, + } + ) + if "redirect_urls" in scrapy_meta: + meta[b"redirect_urls"] = scrapy_meta["redirect_urls"] + return FrontierRequest( + url=scrapy_request.url, + method=scrapy_request.method, + headers=scrapy_request.headers, + cookies=cookies, + meta=meta, + body=scrapy_request.body, + ) def from_frontier(self, frontier_request): """request: Frontier > Scrapy""" - cb = frontier_request.meta.get(b'scrapy_callback', None) + cb = frontier_request.meta.get(b"scrapy_callback", None) if cb and self.spider: cb = _get_method(self.spider, cb) - eb = frontier_request.meta.get(b'scrapy_errback', None) + eb = frontier_request.meta.get(b"scrapy_errback", None) if eb and self.spider: eb = _get_method(self.spider, eb) body = frontier_request.body - meta = frontier_request.meta.get(b'scrapy_meta', {}) - meta[b'frontier_request'] = frontier_request - return ScrapyRequest(url=frontier_request.url, - callback=cb, - errback=eb, - body=body, - method=to_native_str(frontier_request.method), - headers=frontier_request.headers, - cookies=frontier_request.cookies, - meta=meta, - dont_filter=True) + meta = frontier_request.meta.get(b"scrapy_meta", {}) + meta[b"frontier_request"] = frontier_request + return ScrapyRequest( + url=frontier_request.url, + callback=cb, + errback=eb, + body=body, + method=to_unicode(frontier_request.method), + headers=frontier_request.headers, + cookies=frontier_request.cookies, + meta=meta, + dont_filter=True, + ) class ResponseConverter(BaseResponseConverter): """Converts between frontera and Scrapy response objects""" + def __init__(self, spider, request_converter): self.spider = spider self._request_converter = request_converter def to_frontier(self, scrapy_response): """response: Scrapy > Frontier""" - frontier_request = scrapy_response.meta[b'frontier_request'] - frontier_request.meta[b'scrapy_meta'] = scrapy_response.meta - if 'redirect_urls' in scrapy_response.meta: - frontier_request.meta[b'redirect_urls'] = scrapy_response.meta['redirect_urls'] - del scrapy_response.meta[b'frontier_request'] - return FrontierResponse(url=scrapy_response.url, - status_code=scrapy_response.status, - headers=scrapy_response.headers, - body=scrapy_response.body, - request=frontier_request) + frontier_request = scrapy_response.meta[b"frontier_request"] + 
frontier_request.meta[b"scrapy_meta"] = scrapy_response.meta + if "redirect_urls" in scrapy_response.meta: + frontier_request.meta[b"redirect_urls"] = scrapy_response.meta[ + "redirect_urls" + ] + del scrapy_response.meta[b"frontier_request"] + return FrontierResponse( + url=scrapy_response.url, + status_code=scrapy_response.status, + headers=scrapy_response.headers, + body=scrapy_response.body, + request=frontier_request, + ) def from_frontier(self, response): """response: Frontier > Scrapy""" - return ScrapyResponse(url=response.url, - status=response.status_code, - headers=response.headers, - body=response.body, - request=self._request_converter.from_frontier(response.request)) + return ScrapyResponse( + url=response.url, + status=response.status_code, + headers=response.headers, + body=response.body, + request=self._request_converter.from_frontier(response.request), + ) def _find_method(obj, func): - if obj and hasattr(func, '__self__') and func.__self__ is obj: + if obj and hasattr(func, "__self__") and func.__self__ is obj: return to_bytes(func.__func__.__name__) - else: - raise ValueError("Function %s is not a method of: %s" % (func, obj)) + raise ValueError(f"Function {func} is not a method of: {obj}") def _get_method(obj, name): - name = to_native_str(name) + name = to_unicode(name) try: return getattr(obj, name) - except AttributeError: - raise ValueError("Method %r not found in: %s" % (name, obj)) + except AttributeError as e: + raise ValueError(f"Method {name!r} not found in: {obj}") from e diff --git a/frontera/contrib/scrapy/manager.py b/frontera/contrib/scrapy/manager.py index 2cae44c46..6fe3bbaf5 100644 --- a/frontera/contrib/scrapy/manager.py +++ b/frontera/contrib/scrapy/manager.py @@ -1,14 +1,15 @@ -from __future__ import absolute_import from frontera.utils.managers import FrontierManagerWrapper + from .converters import RequestConverter, ResponseConverter class ScrapyFrontierManager(FrontierManagerWrapper): - spider = None def set_spider(self, spider): - assert self.spider is None, 'Spider is already set. Only one spider is supported per process.' + assert self.spider is None, ( + "Spider is already set. Only one spider is supported per process." 
+ ) self.spider = spider self.request_converter = RequestConverter(self.spider) self.response_converter = ResponseConverter(self.spider, self.request_converter) diff --git a/frontera/contrib/scrapy/middlewares/schedulers.py b/frontera/contrib/scrapy/middlewares/schedulers.py index 6e1e0d3ea..ba7610514 100644 --- a/frontera/contrib/scrapy/middlewares/schedulers.py +++ b/frontera/contrib/scrapy/middlewares/schedulers.py @@ -1,7 +1,4 @@ - - -class BaseSchedulerMiddleware(object): - +class BaseSchedulerMiddleware: def __init__(self, crawler): self.crawler = crawler @@ -11,7 +8,12 @@ def from_crawler(cls, crawler): @property def scheduler(self): - return self.crawler.engine.slot.scheduler + try: + # To be exposed as engine.scheduler as part of + # https://github.com/scrapy/scrapy/pull/6715 + return self.crawler.engine._slot.scheduler + except AttributeError: # Scrapy < 2.13.0 + return self.crawler.engine.slot.scheduler class SchedulerSpiderMiddleware(BaseSchedulerMiddleware): diff --git a/frontera/contrib/scrapy/middlewares/seeds/__init__.py b/frontera/contrib/scrapy/middlewares/seeds/__init__.py index 09cd0b7cd..c260e9811 100644 --- a/frontera/contrib/scrapy/middlewares/seeds/__init__.py +++ b/frontera/contrib/scrapy/middlewares/seeds/__init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import +from scrapy import Request -class SeedLoader(object): +class SeedLoader: def __init__(self, crawler): self.crawler = crawler self.configure(crawler.settings) @@ -14,11 +14,8 @@ def from_crawler(cls, crawler): return cls(crawler) def process_start_requests(self, start_requests, spider): - urls = [url for url in self.load_seeds() if not url.startswith('#')] - return [spider.make_requests_from_url(url) for url in urls] + urls = [url for url in self.load_seeds() if not url.startswith("#")] + return [Request(url, dont_filter=True) for url in urls] def load_seeds(self): raise NotImplementedError - - - diff --git a/frontera/contrib/scrapy/middlewares/seeds/file.py b/frontera/contrib/scrapy/middlewares/seeds/file.py index c70953de0..8233bb5a7 100644 --- a/frontera/contrib/scrapy/middlewares/seeds/file.py +++ b/frontera/contrib/scrapy/middlewares/seeds/file.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import codecs from scrapy.exceptions import NotConfigured @@ -8,7 +7,7 @@ class FileSeedLoader(SeedLoader): def configure(self, settings): - self.seeds_source = settings.get('SEEDS_SOURCE') + self.seeds_source = settings.get("SEEDS_SOURCE") if not self.seeds_source: raise NotConfigured @@ -17,8 +16,8 @@ def load_seeds(self): return self.load_seeds_from_file(self.seeds_source) def load_seeds_from_file(self, file_path): - with codecs.open(file_path, 'rU') as f: - return self.load_seeds_from_data((f)) + with codecs.open(file_path, "rU") as f: + return self.load_seeds_from_data(f) def load_seeds_from_data(self, data): seeds = [] @@ -29,4 +28,4 @@ def load_seeds_from_data(self, data): return seeds def clean_seed(self, url): - return url.strip('\t\n\r') + return url.strip("\t\n\r") diff --git a/frontera/contrib/scrapy/middlewares/seeds/s3.py b/frontera/contrib/scrapy/middlewares/seeds/s3.py index abaf3c3b6..62f651bd5 100644 --- a/frontera/contrib/scrapy/middlewares/seeds/s3.py +++ b/frontera/contrib/scrapy/middlewares/seeds/s3.py @@ -1,5 +1,5 @@ -from __future__ import absolute_import -from six.moves.urllib.parse import urlparse +from urllib.parse import urlparse + from boto import connect_s3 from scrapy.exceptions import NotConfigured @@ -8,23 +8,22 @@ class S3SeedLoader(FileSeedLoader): def 
configure(self, settings): - source = settings.get('SEEDS_SOURCE') + source = settings.get("SEEDS_SOURCE") u = urlparse(source) - if not u.hostname or not u.scheme == 's3': + if not u.hostname or not u.scheme == "s3": raise NotConfigured self.bucket_name = u.hostname - self.bucket_keys_prefix = u.path.lstrip('/') - self.s3_aws_access_key = settings.get('SEEDS_AWS_ACCESS_KEY') - self.s3_aws_secret_key = settings.get('SEEDS_AWS_SECRET_ACCESS_KEY') + self.bucket_keys_prefix = u.path.lstrip("/") + self.s3_aws_access_key = settings.get("SEEDS_AWS_ACCESS_KEY") + self.s3_aws_secret_key = settings.get("SEEDS_AWS_SECRET_ACCESS_KEY") def load_seeds(self): - conn = connect_s3(self.s3_aws_access_key, - self.s3_aws_secret_key) + conn = connect_s3(self.s3_aws_access_key, self.s3_aws_secret_key) bucket = conn.get_bucket(self.bucket_name) seeds = [] for key in bucket.list(self.bucket_keys_prefix): if key.name.endswith(".txt"): - data = key.get_contents_as_string(encoding='utf-8').split() + data = key.get_contents_as_string(encoding="utf-8").split() file_seeds = self.load_seeds_from_data(data) seeds.extend(file_seeds) return seeds diff --git a/frontera/contrib/scrapy/overusedbuffer.py b/frontera/contrib/scrapy/overusedbuffer.py index 7a5a125e6..ec6bf6b8d 100644 --- a/frontera/contrib/scrapy/overusedbuffer.py +++ b/frontera/contrib/scrapy/overusedbuffer.py @@ -1,6 +1,5 @@ -from __future__ import absolute_import -from scrapy.utils.httpobj import urlparse_cached from scrapy.resolver import dnscache +from scrapy.utils.httpobj import urlparse_cached from frontera.core import OverusedBuffer @@ -11,7 +10,7 @@ class OverusedBufferScrapy(OverusedBuffer): """ def _get_key(self, request, type): - key = urlparse_cached(request).hostname or '' - if type == 'ip': + key = urlparse_cached(request).hostname or "" + if type == "ip": key = dnscache.get(key, key) return key diff --git a/frontera/contrib/scrapy/schedulers/frontier.py b/frontera/contrib/scrapy/schedulers/frontier.py index f83f08cfa..cc5e35c00 100644 --- a/frontera/contrib/scrapy/schedulers/frontier.py +++ b/frontera/contrib/scrapy/schedulers/frontier.py @@ -1,68 +1,67 @@ -from __future__ import absolute_import -from scrapy.core.scheduler import Scheduler -from scrapy.http import Request -from logging import getLogger - from collections import deque +from logging import getLogger from time import time +from scrapy.core.scheduler import Scheduler +from scrapy.http import Request + from frontera.contrib.scrapy.manager import ScrapyFrontierManager from frontera.contrib.scrapy.settings_adapter import ScrapySettingsAdapter -import six -STATS_PREFIX = 'frontera' +STATS_PREFIX = "frontera" -class StatsManager(object): +class StatsManager: """ - 'frontera/crawled_pages_count': 489, - 'frontera/crawled_pages_count/200': 382, - 'frontera/crawled_pages_count/301': 37, - 'frontera/crawled_pages_count/302': 58, - 'frontera/crawled_pages_count/400': 5, - 'frontera/crawled_pages_count/403': 1, - 'frontera/crawled_pages_count/404': 1, - 'frontera/crawled_pages_count/999': 5, - 'frontera/iterations': 5, - 'frontera/links_extracted_count': 39805, - 'frontera/pending_requests_count': 0, - 'frontera/redirected_requests_count': 273, - 'frontera/request_errors_count': 11, - 'frontera/request_errors_count/DNSLookupError': 1, - 'frontera/request_errors_count/ResponseNeverReceived': 9, - 'frontera/request_errors_count/TimeoutError': 1, - 'frontera/returned_requests_count': 500, + 'frontera/crawled_pages_count': 489, + 'frontera/crawled_pages_count/200': 382, + 
'frontera/crawled_pages_count/301': 37, + 'frontera/crawled_pages_count/302': 58, + 'frontera/crawled_pages_count/400': 5, + 'frontera/crawled_pages_count/403': 1, + 'frontera/crawled_pages_count/404': 1, + 'frontera/crawled_pages_count/999': 5, + 'frontera/iterations': 5, + 'frontera/links_extracted_count': 39805, + 'frontera/pending_requests_count': 0, + 'frontera/redirected_requests_count': 273, + 'frontera/request_errors_count': 11, + 'frontera/request_errors_count/DNSLookupError': 1, + 'frontera/request_errors_count/ResponseNeverReceived': 9, + 'frontera/request_errors_count/TimeoutError': 1, + 'frontera/returned_requests_count': 500, """ + def __init__(self, stats, prefix=STATS_PREFIX): self.stats = stats self.prefix = prefix def add_seeds(self, count=1): - self._inc_value('seeds_count', count) + self._inc_value("seeds_count", count) def add_crawled_page(self, status_code, n_links): - self._inc_value('crawled_pages_count') - self._inc_value('crawled_pages_count/%s' % str(status_code)) - self._inc_value('links_extracted_count', n_links) + self._inc_value("crawled_pages_count") + self._inc_value(f"crawled_pages_count/{status_code!s}") + self._inc_value("links_extracted_count", n_links) def add_redirected_requests(self, count=1): - self._inc_value('redirected_requests_count', count) + self._inc_value("redirected_requests_count", count) def add_returned_requests(self, count=1): - self._inc_value('returned_requests_count', count) + self._inc_value("returned_requests_count", count) def add_request_error(self, error_code): - self._inc_value('request_errors_count') - self._inc_value('request_errors_count/%s' % str(error_code)) + self._inc_value("request_errors_count") + self._inc_value(f"request_errors_count/{error_code!s}") def set_iterations(self, iterations): - self._set_value('iterations', iterations) + self._set_value("iterations", iterations) def set_pending_requests(self, pending_requests): - self._set_value('pending_requests_count', pending_requests) + self._set_value("pending_requests_count", pending_requests) def _get_stats_name(self, variable): - return '%s/%s' % (self.prefix, variable) + return f"{self.prefix}/{variable}" def _inc_value(self, variable, count=1): self.stats.inc_value(self._get_stats_name(variable), count) @@ -72,17 +71,16 @@ def _set_value(self, variable, value): class FronteraScheduler(Scheduler): - def __init__(self, crawler, manager=None): self.crawler = crawler self.stats_manager = StatsManager(crawler.stats) self._pending_requests = deque() - self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED') + self.redirect_enabled = crawler.settings.get("REDIRECT_ENABLED") settings = ScrapySettingsAdapter(crawler.settings) self.frontier = ScrapyFrontierManager(settings, manager) - self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY') + self._delay_on_empty = self.frontier.manager.settings.get("DELAY_ON_EMPTY") self._delay_next_call = 0.0 - self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler') + self.logger = getLogger("frontera.contrib.scrapy.schedulers.FronteraScheduler") @classmethod def from_crawler(cls, crawler): @@ -93,7 +91,7 @@ def enqueue_request(self, request): self.frontier.add_seeds([request]) self.stats_manager.add_seeds() return True - elif self.redirect_enabled: + if self.redirect_enabled: self._add_pending_request(request) self.stats_manager.add_redirected_requests() return True @@ -112,10 +110,10 @@ def process_spider_output(self, response, result, spider): links.append(element) else: yield 
element - frontier_request = response.meta[b'frontier_request'] + frontier_request = response.meta[b"frontier_request"] self.frontier.page_crawled(response) # removed frontier part from .meta # putting it back, to persist .meta from original request - response.meta[b'frontier_request'] = frontier_request + response.meta[b"frontier_request"] = frontier_request self.frontier.links_extracted(response.request, links) self.stats_manager.add_crawled_page(response.status, len(links)) @@ -143,15 +141,20 @@ def has_pending_requests(self): return not self.frontier.finished() def _get_next_request(self): - if not self.frontier.manager.finished and \ - len(self) < self.crawler.engine.downloader.total_concurrency and \ - self._delay_next_call < time(): - + if ( + not self.frontier.manager.finished + and len(self) < self.crawler.engine.downloader.total_concurrency + and self._delay_next_call < time() + ): info = self._get_downloader_info() - requests = self.frontier.get_next_requests(key_type=info['key_type'], overused_keys=info['overused_keys']) + requests = self.frontier.get_next_requests( + key_type=info["key_type"], overused_keys=info["overused_keys"] + ) for request in requests: self._add_pending_request(request) - self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0 + self._delay_next_call = ( + time() + self._delay_on_empty if not requests else 0.0 + ) return self._get_pending_request() def _add_pending_request(self, request): @@ -163,20 +166,22 @@ def _get_pending_request(self): def _get_exception_code(self, exception): try: return exception.__class__.__name__ - except: - return '?' + except Exception: + return "?" def _request_is_redirected(self, request): - return request.meta.get(b'redirect_times', 0) > 0 + return request.meta.get(b"redirect_times", 0) > 0 def _get_downloader_info(self): downloader = self.crawler.engine.downloader info = { - 'key_type': 'ip' if downloader.ip_concurrency else 'domain', - 'overused_keys': [] + "key_type": "ip" if downloader.ip_concurrency else "domain", + "overused_keys": [], } - for key, slot in six.iteritems(downloader.slots): + for key, slot in downloader.slots.items(): overused_factor = len(slot.active) / float(slot.concurrency) - if overused_factor > self.frontier.manager.settings.get('OVERUSED_SLOT_FACTOR'): - info['overused_keys'].append(key) + if overused_factor > self.frontier.manager.settings.get( + "OVERUSED_SLOT_FACTOR" + ): + info["overused_keys"].append(key) return info diff --git a/frontera/contrib/scrapy/schedulers/recording.py b/frontera/contrib/scrapy/schedulers/recording.py index bcaed25e3..cd979db80 100644 --- a/frontera/contrib/scrapy/schedulers/recording.py +++ b/frontera/contrib/scrapy/schedulers/recording.py @@ -1,9 +1,6 @@ -from __future__ import absolute_import -import pprint - +from scrapy import log from scrapy.core.scheduler import Scheduler from scrapy.http import Request -from scrapy import log from frontera import graphs @@ -12,35 +9,36 @@ DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES = True DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT = True -STATS_PREFIX = 'recorder' +STATS_PREFIX = "recorder" -class StatsManager(object): +class StatsManager: """ - 'recorder/pages_count': xx, - 'recorder/seeds_count': xx, - 'recorder/links_count': xx, + 'recorder/pages_count': xx, + 'recorder/seeds_count': xx, + 'recorder/links_count': xx, """ + def __init__(self, stats, prefix=STATS_PREFIX): self.stats = stats self.prefix = prefix def add_page(self, is_seed=False): - self._inc_value('pages_count') + 
self._inc_value("pages_count") if is_seed: - self._inc_value('seeds_count') + self._inc_value("seeds_count") def remove_pages(self, count): - self._inc_value('pages_count', -count) + self._inc_value("pages_count", -count) def add_link(self): - self._inc_value('links_count') + self._inc_value("links_count") def remove_links(self, count): - self._inc_value('links_count', -count) + self._inc_value("links_count", -count) def _get_stats_name(self, variable): - return '%s/%s' % (self.prefix, variable) + return f"{self.prefix}/{variable}" def _inc_value(self, variable, count=1): self.stats.inc_value(self._get_stats_name(variable), count) @@ -50,44 +48,55 @@ def _set_value(self, variable, value): class RecorderScheduler(Scheduler): - def open(self, spider): - super(RecorderScheduler, self).open(spider) + super().open(spider) self.stats_manager = StatsManager(spider.crawler.stats) settings = spider.crawler.settings - self.recorder_enabled = settings.get('RECORDER_ENABLED', DEFAULT_RECORDER_ENABLED) + self.recorder_enabled = settings.get( + "RECORDER_ENABLED", DEFAULT_RECORDER_ENABLED + ) if not self.recorder_enabled: - log.msg('Recorder disabled!', log.WARNING) + log.msg("Recorder disabled!", log.WARNING) return - log.msg('Starting recorder', log.INFO) + log.msg("Starting recorder", log.INFO) - recorder_storage = settings.get('RECORDER_STORAGE_ENGINE', None) + recorder_storage = settings.get("RECORDER_STORAGE_ENGINE", None) if not recorder_storage: self.recorder_enabled = False - log.msg('Missing Recorder storage! Recorder disabled...', log.WARNING) + log.msg("Missing Recorder storage! Recorder disabled...", log.WARNING) return self.graph = graphs.Manager( engine=recorder_storage, - drop_all_tables=settings.getbool('RECORDER_STORAGE_DROP_ALL_TABLES', - DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES), - clear_content=settings.getbool('RECORDER_STORAGE_CLEAR_CONTENT', - DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT)) + drop_all_tables=settings.getbool( + "RECORDER_STORAGE_DROP_ALL_TABLES", + DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES, + ), + clear_content=settings.getbool( + "RECORDER_STORAGE_CLEAR_CONTENT", DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT + ), + ) def close(self, reason): - super(RecorderScheduler, self).close(reason) + super().close(reason) if self.recorder_enabled: - log.msg('Finishing recorder (%s)' % reason, log.INFO) + log.msg(f"Finishing recorder ({reason})", log.INFO) pages = self.graph.session.query(graphs.Page).filter_by(status=None).all() for page in pages: - n_deleted_links = self.graph.session.query(graphs.Relation).filter_by(child_id=page.id).delete() + n_deleted_links = ( + self.graph.session.query(graphs.Relation) + .filter_by(child_id=page.id) + .delete() + ) if n_deleted_links: self.stats_manager.remove_links(n_deleted_links) - n_deleted_pages = self.graph.session.query(graphs.Page).filter_by(status=None).delete() + n_deleted_pages = ( + self.graph.session.query(graphs.Page).filter_by(status=None).delete() + ) if n_deleted_pages: self.stats_manager.remove_pages(n_deleted_pages) self.graph.save() @@ -98,51 +107,52 @@ def enqueue_request(self, request): return dqok = self._dqpush(request) if dqok: - self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider) + self.stats.inc_value("scheduler/enqueued/disk", spider=self.spider) else: self._mqpush(request) - self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider) - self.stats.inc_value('scheduler/enqueued', spider=self.spider) + self.stats.inc_value("scheduler/enqueued/memory", spider=self.spider) + 
self.stats.inc_value("scheduler/enqueued", spider=self.spider) if self.recorder_enabled: - is_seed = b'rule' not in request.meta and \ - b'origin_is_recorder' not in request.meta + is_seed = ( + b"rule" not in request.meta + and b"origin_is_recorder" not in request.meta + ) page = self.graph.add_page(url=request.url, is_seed=is_seed) self.stats_manager.add_page(is_seed) - request.meta[b'is_seed'] = is_seed - request.meta[b'page'] = page + request.meta[b"is_seed"] = is_seed + request.meta[b"page"] = page def next_request(self): - request = super(RecorderScheduler, self).next_request() + request = super().next_request() if self.recorder_enabled and request: - request.meta[b'origin_is_recorder'] = True + request.meta[b"origin_is_recorder"] = True return request def process_spider_output(self, response, result, spider): if not self.recorder_enabled: - for r in result: - yield r + yield from result return - page = response.meta[b'page'] + page = response.meta[b"page"] page.status = response.status self.graph.save() requests = [r for r in result if isinstance(r, Request)] for request in requests: link = self.graph.add_link(page=page, url=request.url) - request.meta[b'page'] = link - request.meta[b'referer'] = page + request.meta[b"page"] = link + request.meta[b"referer"] = page self.stats_manager.add_link() yield request def process_exception(self, request, exception, spider): if self.recorder_enabled: error_code = self._get_exception_code(exception) - page = request.meta[b'page'] + page = request.meta[b"page"] page.status = error_code self.graph.save() def _get_exception_code(self, exception): try: return exception.__class__.__name__ - except: - return '?' + except Exception: + return "?" diff --git a/frontera/contrib/scrapy/settings_adapter.py b/frontera/contrib/scrapy/settings_adapter.py index a040d07e6..0e6e6d4ef 100644 --- a/frontera/contrib/scrapy/settings_adapter.py +++ b/frontera/contrib/scrapy/settings_adapter.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import from frontera.settings import BaseSettings, DefaultSettings @@ -6,14 +5,15 @@ class ScrapySettingsAdapter(BaseSettings): """ Wrapps the frontera settings, falling back to scrapy and default settings """ + def __init__(self, crawler_settings): - frontera_settings = crawler_settings.get('FRONTERA_SETTINGS', None) - super(ScrapySettingsAdapter, self).__init__(module=frontera_settings) + frontera_settings = crawler_settings.get("FRONTERA_SETTINGS", None) + super().__init__(module=frontera_settings) self._crawler_settings = crawler_settings or {} self._default_settings = DefaultSettings() def get(self, key, default_value=None): - val = super(ScrapySettingsAdapter, self).get(key) + val = super().get(key) if val is not None: return val diff --git a/frontera/core/__init__.py b/frontera/core/__init__.py index 7623db480..c30b7b8c8 100644 --- a/frontera/core/__init__.py +++ b/frontera/core/__init__.py @@ -1,8 +1,6 @@ -from __future__ import absolute_import -from six.moves.urllib.parse import urlparse -from socket import getaddrinfo from collections import defaultdict, deque -import six +from socket import getaddrinfo +from urllib.parse import urlparse def get_slot_key(request, type): # TODO: Probably use caching here @@ -15,19 +13,20 @@ def get_slot_key(request, type): # TODO: Probably use caching here :param str type: either 'domain'(default) or 'ip'. 
:return: string """ - key = urlparse(request.url).hostname or '' - if type == 'ip': + key = urlparse(request.url).hostname or "" + if type == "ip": for result in getaddrinfo(key, 80): key = result[4][0] break return key -class OverusedBuffer(object): +class OverusedBuffer: """ A buffering object for implementing the buffer of Frontera requests for overused domains/ips. It can be used when customizing backend to address efficient downloader pool usage. """ + def __init__(self, _get_func, log_func=None): """ :param _get_func: reference to get_next_requests() method of binded class @@ -41,7 +40,7 @@ def _get_key(self, request, type): return get_slot_key(request, type) def _get_pending_count(self): - return sum(six.moves.map(len, six.itervalues(self._pending))) + return sum(map(len, self._pending.values())) def _get_pending(self, max_n_requests, overused_set): pending = self._pending @@ -49,26 +48,26 @@ def _get_pending(self, max_n_requests, overused_set): while i < max_n_requests and keys: for key in keys.copy(): - try: + if pending[key]: yield pending[key].popleft() i += 1 - except IndexError: + else: keys.discard(key) del pending[key] def get_next_requests(self, max_n_requests, **kwargs): if self._log: - self._log("Overused keys: %s" % str(kwargs['overused_keys'])) - self._log("Pending: %d" % self._get_pending_count()) + self._log(f"Overused keys: {kwargs['overused_keys']!s}") + self._log(f"Pending: {self._get_pending_count()}") - overused_set = set(kwargs['overused_keys']) + overused_set = set(kwargs["overused_keys"]) requests = list(self._get_pending(max_n_requests, overused_set)) if len(requests) == max_n_requests: return requests - for request in self._get(max_n_requests-len(requests), **kwargs): - key = self._get_key(request, kwargs['key_type']) + for request in self._get(max_n_requests - len(requests), **kwargs): + key = self._get_key(request, kwargs["key_type"]) if key in overused_set: self._pending[key].append(request) else: diff --git a/frontera/core/codec.py b/frontera/core/codec.py index 45f6e0068..64413e27b 100644 --- a/frontera/core/codec.py +++ b/frontera/core/codec.py @@ -1,12 +1,7 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from abc import ABCMeta, abstractmethod -import six -@six.add_metaclass(ABCMeta) -class BaseDecoder(object): - +class BaseDecoder(metaclass=ABCMeta): @abstractmethod def decode(self, buffer): """ @@ -15,7 +10,6 @@ def decode(self, buffer): :param bytes buffer: encoded message :return: tuple of message type and related objects """ - pass @abstractmethod def decode_request(self, buffer): @@ -25,12 +19,9 @@ def decode_request(self, buffer): :param bytes buffer: serialized string :return: object Request """ - pass - -@six.add_metaclass(ABCMeta) -class BaseEncoder(object): +class BaseEncoder(metaclass=ABCMeta): @abstractmethod def encode_add_seeds(self, seeds): """ @@ -39,7 +30,6 @@ def encode_add_seeds(self, seeds): :param list seeds: A list of frontier Request objects :return: bytes encoded message """ - pass @abstractmethod def encode_page_crawled(self, response): @@ -50,7 +40,6 @@ def encode_page_crawled(self, response): :return: bytes encoded message """ - pass @abstractmethod def encode_links_extracted(self, request, links): @@ -62,7 +51,6 @@ def encode_links_extracted(self, request, links): :return: bytes encoded message """ - pass @abstractmethod def encode_request_error(self, request, error): @@ -74,7 +62,6 @@ def encode_request_error(self, request, error): :return: bytes encoded message """ - pass @abstractmethod def 
encode_request(self, request): @@ -84,7 +71,6 @@ def encode_request(self, request): :param object request: Frontera Request object :return: bytes encoded message """ - pass @abstractmethod def encode_update_score(self, request, score, schedule): @@ -96,7 +82,6 @@ def encode_update_score(self, request, score, schedule): :param bool schedule: True if document needs to be scheduled for download :return: bytes encoded message """ - pass @abstractmethod def encode_new_job_id(self, job_id): @@ -106,7 +91,6 @@ def encode_new_job_id(self, job_id): :param int job_id: :return: bytes encoded message """ - pass @abstractmethod def encode_offset(self, partition_id, offset): @@ -117,4 +101,3 @@ def encode_offset(self, partition_id, offset): :param int offset: :return: bytes encoded message """ - pass diff --git a/frontera/core/components.py b/frontera/core/components.py index 33529c7bc..7334af54b 100644 --- a/frontera/core/components.py +++ b/frontera/core/components.py @@ -1,24 +1,19 @@ -from __future__ import absolute_import from abc import ABCMeta, abstractmethod, abstractproperty -import six -class StartStopMixin(object): +class StartStopMixin: def frontier_start(self): """ Called when the frontier starts, see :ref:`starting/stopping the frontier `. """ - pass def frontier_stop(self): """ Called when the frontier stops, see :ref:`starting/stopping the frontier `. """ - pass -@six.add_metaclass(ABCMeta) -class Metadata(StartStopMixin): +class Metadata(StartStopMixin, metaclass=ABCMeta): """Interface definition for a frontier metadata class. This class is responsible for storing documents metadata, including content and optimized for write-only data flow.""" @@ -29,7 +24,6 @@ def add_seeds(self, seeds): :param list seeds: A list of :class:`Request ` objects. """ - pass @abstractmethod def page_crawled(self, response): @@ -38,7 +32,6 @@ def page_crawled(self, response): :param object response: The :class:`Response ` object for the crawled page. """ - pass @abstractmethod def links_extracted(self, request, links): @@ -48,7 +41,6 @@ def links_extracted(self, request, links): :param object request: The original :class:`Request ` object for the crawled page. :param list links: A list of :class:`Request ` objects containing extracted links. """ - pass @abstractmethod def request_error(self, page, error): @@ -58,11 +50,9 @@ def request_error(self, page, error): :param object request: The crawled with error :class:`Request ` object. :param string error: A string identifier for the error. """ - pass -@six.add_metaclass(ABCMeta) -class Queue(StartStopMixin): +class Queue(StartStopMixin, metaclass=ABCMeta): """Interface definition for a frontier queue class. The queue has priorities and partitions.""" @abstractmethod @@ -97,8 +87,7 @@ def count(self): raise NotImplementedError -@six.add_metaclass(ABCMeta) -class States(StartStopMixin): +class States(StartStopMixin, metaclass=ABCMeta): """Interface definition for a document states management class. This class is responsible for providing actual documents state, and persist the state changes in batch-oriented manner.""" @@ -144,8 +133,7 @@ def fetch(self, fingerprints): raise NotImplementedError -@six.add_metaclass(ABCMeta) -class Component(Metadata): +class Component(Metadata, metaclass=ABCMeta): """ Interface definition for a frontier component The :class:`Component ` object is the base class for frontier @@ -160,7 +148,8 @@ class Component(Metadata): but in their corresponding section. 
""" - component_name = 'Base Component' + + component_name = "Base Component" @property def name(self): @@ -184,19 +173,19 @@ def from_manager(cls, manager): return cls() -@six.add_metaclass(ABCMeta) -class Middleware(Component): +class Middleware(Component, metaclass=ABCMeta): """Interface definition for a Frontier Middlewares""" - component_name = 'Base Middleware' + + component_name = "Base Middleware" -@six.add_metaclass(ABCMeta) -class CanonicalSolver(Middleware): +class CanonicalSolver(Middleware, metaclass=ABCMeta): """Interface definition for a Frontera Canonical Solver""" - component_name = 'Base CanonicalSolver' + + component_name = "Base CanonicalSolver" -class PropertiesMixin(object): +class PropertiesMixin: @abstractproperty def queue(self): """ @@ -219,8 +208,7 @@ def states(self): raise NotImplementedError -@six.add_metaclass(ABCMeta) -class Backend(PropertiesMixin, Component): +class Backend(PropertiesMixin, Component, metaclass=ABCMeta): """Interface definition for frontier backend.""" @abstractmethod @@ -245,8 +233,7 @@ def get_next_requests(self, max_n_requests, **kwargs): raise NotImplementedError -@six.add_metaclass(ABCMeta) -class DistributedBackend(Backend): +class DistributedBackend(Backend, metaclass=ABCMeta): """Interface definition for distributed frontier backend. Implies using in strategy worker and DB worker.""" @classmethod @@ -258,10 +245,11 @@ def db_worker(cls, manager): raise NotImplementedError -class Partitioner(object): +class Partitioner: """ Base class for a partitioner """ + def __init__(self, partitions): """ Initialize the partitioner @@ -280,6 +268,4 @@ def partition(self, key, partitions=None): key: the key to use for partitioning partitions: (optional) a list of partitions. """ - raise NotImplementedError('partition function has to be implemented') - - + raise NotImplementedError("partition function has to be implemented") diff --git a/frontera/core/manager.py b/frontera/core/manager.py index c83bb780b..d0a31f355 100644 --- a/frontera/core/manager.py +++ b/frontera/core/manager.py @@ -1,26 +1,39 @@ -from __future__ import absolute_import -from collections import OrderedDict +import logging +from frontera.core import models +from frontera.core.components import ( + Backend, + CanonicalSolver, + DistributedBackend, + Middleware, +) from frontera.exceptions import NotConfigured +from frontera.settings import Settings from frontera.utils.misc import load_object -from frontera.settings import Settings, BaseSettings -from frontera.core.components import Backend, DistributedBackend, Middleware, CanonicalSolver -from frontera.core import models -import logging -class ComponentsPipelineMixin(object): - def __init__(self, backend, middlewares=None, canonicalsolver=None, db_worker=False, strategy_worker=False): +class ComponentsPipelineMixin: + def __init__( + self, + backend, + middlewares=None, + canonicalsolver=None, + db_worker=False, + strategy_worker=False, + ): self._logger_components = logging.getLogger("manager.components") # Load middlewares self._middlewares = self._load_middlewares(middlewares) # Load canonical solver - self._logger_components.debug("Loading canonical url solver '%s'", canonicalsolver) + self._logger_components.debug( + "Loading canonical url solver '%s'", canonicalsolver + ) self._canonicalsolver = self._load_object(canonicalsolver) - assert isinstance(self.canonicalsolver, CanonicalSolver), \ - "canonical solver '%s' must subclass CanonicalSolver" % self.canonicalsolver.__class__.__name__ + assert 
isinstance(self.canonicalsolver, CanonicalSolver), ( + f"canonical solver '{self.canonicalsolver.__class__.__name__}' must subclass CanonicalSolver" + ) # Load backend self._logger_components.debug("Loading backend '%s'", backend) @@ -51,20 +64,22 @@ def backend(self): def _load_backend(self, backend, db_worker, strategy_worker): cls = load_object(backend) - assert issubclass(cls, Backend), "backend '%s' must subclass Backend" % cls.__name__ + assert issubclass(cls, Backend), ( + f"backend '{cls.__name__}' must subclass Backend" + ) if issubclass(cls, DistributedBackend): if db_worker: return cls.db_worker(self) if strategy_worker: return cls.strategy_worker(self) raise RuntimeError("Distributed backends are meant to be used in workers.") - else: - assert not strategy_worker, "In order to distribute backend only DistributedBackend " \ - "subclasses are allowed to use." - if hasattr(cls, 'from_manager'): + assert not strategy_worker, ( + "In order to distribute backend only DistributedBackend " + "subclasses are allowed to use." + ) + if hasattr(cls, "from_manager"): return cls.from_manager(self) - else: - return cls() + return cls() def _load_middlewares(self, middleware_names): # TO-DO: Use dict for middleware ordering @@ -73,7 +88,9 @@ def _load_middlewares(self, middleware_names): self._logger_components.debug("Loading middleware '%s'", mw_name) try: mw = self._load_object(mw_name, silent=False) - assert isinstance(mw, Middleware), "middleware '%s' must subclass Middleware" % mw.__class__.__name__ + assert isinstance(mw, Middleware), ( + f"middleware '{mw.__class__.__name__}' must subclass Middleware" + ) if mw: mws.append(mw) except NotConfigured: @@ -83,37 +100,58 @@ def _load_middlewares(self, middleware_names): def _process_components(self, method_name, obj=None, return_classes=None, **kwargs): return_obj = obj - for component_category, component, check_response in self._components_pipeline: - components = component if isinstance(component, list) else [component] - for component in components: - result = self._process_component(component=component, method_name=method_name, - component_category=component_category, obj=return_obj, - return_classes=return_classes, **kwargs) + for component_category, components, check_response in self._components_pipeline: + component_list = ( + components if isinstance(components, list) else [components] + ) + for component in component_list: + result = self._process_component( + component=component, + method_name=method_name, + component_category=component_category, + obj=return_obj, + return_classes=return_classes, + **kwargs, + ) if check_response: return_obj = result if check_response and obj and not return_obj: - self._logger_components.warning("Object '%s' filtered in '%s' by '%s'", - obj.__class__.__name__, method_name, component.__class__.__name__) - return + self._logger_components.warning( + "Object '%s' filtered in '%s' by '%s'", + obj.__class__.__name__, + method_name, + component.__class__.__name__, + ) + return None return return_obj - def _process_component(self, component, method_name, component_category, obj, return_classes, **kwargs): - self._logger_components.debug("processing '%s' '%s.%s' %s", - method_name, component_category, component.__class__.__name__, obj) + def _process_component( + self, component, method_name, component_category, obj, return_classes, **kwargs + ): + self._logger_components.debug( + "processing '%s' '%s.%s' %s", + method_name, + component_category, + component.__class__.__name__, + obj, + ) return_obj = 
getattr(component, method_name)(*([obj] if obj else []), **kwargs) - assert return_obj is None or isinstance(return_obj, return_classes), \ - "%s '%s.%s' must return None or %s, Got '%s'" % \ - (component_category, obj.__class__.__name__, method_name, - ' or '.join(c.__name__ for c in return_classes) - if isinstance(return_classes, tuple) else - return_classes.__name__, - return_obj.__class__.__name__) + assert return_obj is None or isinstance(return_obj, return_classes), ( + "{} '{}.{}' must return None or {}, Got '{}'".format( + component_category, + obj.__class__.__name__, + method_name, + " or ".join(c.__name__ for c in return_classes) + if isinstance(return_classes, tuple) + else return_classes.__name__, + return_obj.__class__.__name__, + ) + ) return return_obj -class BaseManager(object): +class BaseManager: def __init__(self, request_model, response_model, settings=None): - # Settings self._settings = settings or Settings() @@ -121,25 +159,29 @@ def __init__(self, request_model, response_model, settings=None): self._logger = logging.getLogger("manager") # Log frontier manager starting - self._logger.info('-'*80) - self._logger.info('Starting Frontier Manager...') + self._logger.info("-" * 80) + self._logger.info("Starting Frontier Manager...") # Load request model self._request_model = load_object(request_model) - assert issubclass(self._request_model, models.Request), "Request model '%s' must subclass 'Request'" % \ - self._request_model.__name__ + assert issubclass(self._request_model, models.Request), ( + f"Request model '{self._request_model.__name__}' must subclass 'Request'" + ) # Load response model self._response_model = load_object(response_model) - assert issubclass(self._response_model, models.Response), "Response model '%s' must subclass 'Response'" % \ - self._response_model.__name__ + assert issubclass(self._response_model, models.Response), ( + f"Response model '{self._response_model.__name__}' must subclass 'Response'" + ) @classmethod def from_settings(cls, settings=None): manager_settings = Settings(settings) - return BaseManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - settings=manager_settings) + return BaseManager( + request_model=manager_settings.REQUEST_MODEL, + response_model=manager_settings.RESPONSE_MODEL, + settings=manager_settings, + ) def _load_object(self, obj_class_name, silent=False): obj_class = load_object(obj_class_name) @@ -147,13 +189,12 @@ def _load_object(self, obj_class_name, silent=False): return self._load_frontier_object(obj_class) except NotConfigured: if not silent: - raise NotConfigured + raise def _load_frontier_object(self, obj_class): - if hasattr(obj_class, 'from_manager'): + if hasattr(obj_class, "from_manager"): return obj_class.from_manager(self) - else: - return obj_class() + return obj_class() @property def request_model(self): @@ -185,9 +226,22 @@ class FrontierManager(BaseManager, ComponentsPipelineMixin): providing an API to interact with. It's also responsible of loading and communicating all different frontier components. 
""" - def __init__(self, request_model, response_model, backend, middlewares=None, test_mode=False, max_requests=0, - max_next_requests=0, auto_start=True, settings=None, canonicalsolver=None, db_worker=False, - strategy_worker=False): + + def __init__( + self, + request_model, + response_model, + backend, + middlewares=None, + test_mode=False, + max_requests=0, + max_next_requests=0, + auto_start=True, + settings=None, + canonicalsolver=None, + db_worker=False, + strategy_worker=False, + ): """ :param object/string request_model: The :class:`Request ` object to be \ used by the frontier. @@ -227,7 +281,7 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes # Test mode self._test_mode = test_mode - self._logger.debug('Test mode %s' % ("ENABLED" if self.test_mode else "DISABLED")) + self._logger.debug(f"Test mode {'ENABLED' if self.test_mode else 'DISABLED'}") # Page counters self._max_requests = max_requests @@ -240,20 +294,25 @@ def __init__(self, request_model, response_model, backend, middlewares=None, tes # Manager finished flag self._finished = False - ComponentsPipelineMixin.__init__(self, backend=backend, middlewares=middlewares, - canonicalsolver=canonicalsolver, db_worker=db_worker, - strategy_worker=strategy_worker) + ComponentsPipelineMixin.__init__( + self, + backend=backend, + middlewares=middlewares, + canonicalsolver=canonicalsolver, + db_worker=db_worker, + strategy_worker=strategy_worker, + ) # Init frontier components pipeline self._components_pipeline = [ - ('Middleware', self.middlewares, True), - ('CanonicalSolver', self.canonicalsolver, False), - ('Backend', self.backend, False) + ("Middleware", self.middlewares, True), + ("CanonicalSolver", self.canonicalsolver, False), + ("Backend", self.backend, False), ] # Log frontier manager start - self._logger.info('Frontier Manager Started!') - self._logger.info('-'*80) + self._logger.info("Frontier Manager Started!") + self._logger.info("-" * 80) # start/stop self._started = False @@ -270,18 +329,20 @@ def from_settings(cls, settings=None, db_worker=False, strategy_worker=False): :ref:`frontier default settings ` are used. """ manager_settings = Settings.object_from(settings) - return FrontierManager(request_model=manager_settings.REQUEST_MODEL, - response_model=manager_settings.RESPONSE_MODEL, - backend=manager_settings.BACKEND, - middlewares=manager_settings.MIDDLEWARES, - test_mode=manager_settings.TEST_MODE, - max_requests=manager_settings.MAX_REQUESTS, - max_next_requests=manager_settings.MAX_NEXT_REQUESTS, - auto_start=manager_settings.AUTO_START, - settings=manager_settings, - canonicalsolver=manager_settings.CANONICAL_SOLVER, - db_worker=db_worker, - strategy_worker=strategy_worker) + return FrontierManager( + request_model=manager_settings.REQUEST_MODEL, + response_model=manager_settings.RESPONSE_MODEL, + backend=manager_settings.BACKEND, + middlewares=manager_settings.MIDDLEWARES, + test_mode=manager_settings.TEST_MODE, + max_requests=manager_settings.MAX_REQUESTS, + max_next_requests=manager_settings.MAX_NEXT_REQUESTS, + auto_start=manager_settings.AUTO_START, + settings=manager_settings, + canonicalsolver=manager_settings.CANONICAL_SOLVER, + db_worker=db_worker, + strategy_worker=strategy_worker, + ) @property def test_mode(self): @@ -347,9 +408,9 @@ def start(self): :return: None. """ - assert not self._started, 'Frontier already started!' - self._logger.debug('START') - self._process_components(method_name='frontier_start') + assert not self._started, "Frontier already started!" 
+ self._logger.debug("START") + self._process_components(method_name="frontier_start") self._started = True def stop(self): @@ -360,8 +421,8 @@ def stop(self): :return: None. """ self._check_startstop() - self._logger.debug('STOP') - self._process_components(method_name='frontier_stop') + self._logger.debug("STOP") + self._process_components(method_name="frontier_stop") self._stopped = True def add_seeds(self, seeds): @@ -376,12 +437,13 @@ def add_seeds(self, seeds): # FIXME probably seeds should be a generator here assert len(seeds), "Empty seeds list" for seed in seeds: - assert isinstance(seed, self._request_model), "Seed objects must subclass '%s', '%s' found" % \ - (self._request_model.__name__, type(seed).__name__) - self._logger.debug('ADD_SEEDS urls_length=%d', len(seeds)) - self._process_components(method_name='add_seeds', - obj=seeds, - return_classes=(list,)) # TODO: Dar vuelta + assert isinstance(seed, self._request_model), ( + f"Seed objects must subclass '{self._request_model.__name__}', '{type(seed).__name__}' found" + ) + self._logger.debug("ADD_SEEDS urls_length=%d", len(seeds)) + self._process_components( + method_name="add_seeds", obj=seeds, return_classes=(list,) + ) # TODO: Dar vuelta def get_next_requests(self, max_next_requests=0, **kwargs): """ @@ -399,22 +461,27 @@ def get_next_requests(self, max_next_requests=0, **kwargs): # End condition check if self.max_requests and self.n_requests >= self.max_requests: - self._logger.info('MAX PAGES REACHED! (%s/%s)', self.n_requests, self.max_requests) + self._logger.info( + "MAX PAGES REACHED! (%s/%s)", self.n_requests, self.max_requests + ) self._finished = True return [] # Calculate number of requests max_next_requests = max_next_requests or self.max_next_requests - if self.max_requests: - if not max_next_requests: - max_next_requests = self.max_requests - self.n_requests - else: - if self.n_requests+max_next_requests > self.max_requests: - max_next_requests = self.max_requests - self.n_requests + if self.max_requests and ( + not max_next_requests + or self.n_requests + max_next_requests > self.max_requests + ): + max_next_requests = self.max_requests - self.n_requests # log (in) - self._logger.debug('GET_NEXT_REQUESTS(in) max_next_requests=%s n_requests=%s/%s', - max_next_requests, self.n_requests, self.max_requests or '-') + self._logger.debug( + "GET_NEXT_REQUESTS(in) max_next_requests=%s n_requests=%s/%s", + max_next_requests, + self.n_requests, + self.max_requests or "-", + ) # get next requests next_requests = self.backend.get_next_requests(max_next_requests, **kwargs) @@ -427,8 +494,12 @@ def get_next_requests(self, max_next_requests=0, **kwargs): self._iteration += 1 # log (out) - self._logger.debug('GET_NEXT_REQUESTS(out) returned_requests=%s n_requests=%s/%s', - len(next_requests), self.n_requests, self.max_requests or '-') + self._logger.debug( + "GET_NEXT_REQUESTS(out) returned_requests=%s n_requests=%s/%s", + len(next_requests), + self.n_requests, + self.max_requests or "-", + ) return next_requests def page_crawled(self, response): @@ -440,19 +511,25 @@ def page_crawled(self, response): :return: None. 
""" self._check_startstop() - self._logger.debug('PAGE_CRAWLED url=%s status=%s', response.url, response.status_code) - assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ - (self.response_model.__name__, type(response).__name__) - assert hasattr(response, 'request') and response.request, "Empty response request" - assert isinstance(response.request, self.request_model), "Response request object must subclass '%s', " \ - "'%s' found" % \ - (self.request_model.__name__, - type(response.request).__name__) - assert isinstance(response, self.response_model), "Response object must subclass '%s', '%s' found" % \ - (self.response_model.__name__, type(response).__name__) - self._process_components(method_name='page_crawled', - obj=response, - return_classes=self.response_model) + self._logger.debug( + "PAGE_CRAWLED url=%s status=%s", response.url, response.status_code + ) + assert isinstance(response, self.response_model), ( + f"Response object must subclass '{self.response_model.__name__}', '{type(response).__name__}' found" + ) + assert hasattr(response, "request") and response.request, ( + "Empty response request" + ) + assert isinstance(response.request, self.request_model), ( + f"Response request object must subclass '{self.request_model.__name__}', " + f"'{type(response.request).__name__}' found" + ) + assert isinstance(response, self.response_model), ( + f"Response object must subclass '{self.response_model.__name__}', '{type(response).__name__}' found" + ) + self._process_components( + method_name="page_crawled", obj=response, return_classes=self.response_model + ) def links_extracted(self, request, links): """ @@ -465,16 +542,20 @@ def links_extracted(self, request, links): :return: None. """ self._check_startstop() - self._logger.debug('LINKS_EXTRACTED url=%s links=%d', request.url, len(links)) - assert isinstance(request, self.request_model), "Request object must subclass '%s', '%s' found" % \ - (self.request_model.__name__, type(request).__name__) + self._logger.debug("LINKS_EXTRACTED url=%s links=%d", request.url, len(links)) + assert isinstance(request, self.request_model), ( + f"Request object must subclass '{self.request_model.__name__}', '{type(request).__name__}' found" + ) for link in links: - assert isinstance(link, self._request_model), "Link objects must subclass '%s', '%s' found" % \ - (self._request_model.__name__, type(link).__name__) - self._process_components(method_name='links_extracted', - obj=request, - return_classes=self.request_model, - links=links) + assert isinstance(link, self._request_model), ( + f"Link objects must subclass '{self._request_model.__name__}', '{type(link).__name__}' found" + ) + self._process_components( + method_name="links_extracted", + obj=request, + return_classes=self.request_model, + links=links, + ) def request_error(self, request, error): """ @@ -486,12 +567,13 @@ def request_error(self, request, error): :return: None. """ self._check_startstop() - self._logger.debug('PAGE_REQUEST_ERROR url=%s error=%s', request.url, error) - processed_page = self._process_components(method_name='request_error', - obj=request, - return_classes=self.request_model, - error=error) - return processed_page + self._logger.debug("PAGE_REQUEST_ERROR url=%s error=%s", request.url, error) + return self._process_components( + method_name="request_error", + obj=request, + return_classes=self.request_model, + error=error, + ) def _check_startstop(self): assert self._started, "Frontier not started!" 
diff --git a/frontera/core/messagebus.py b/frontera/core/messagebus.py index 3782f6c00..0c3b997c1 100644 --- a/frontera/core/messagebus.py +++ b/frontera/core/messagebus.py @@ -1,12 +1,7 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from abc import ABCMeta, abstractmethod -import six -@six.add_metaclass(ABCMeta) -class BaseStreamConsumer(object): - +class BaseStreamConsumer(metaclass=ABCMeta): @abstractmethod def get_messages(self, timeout=0.1, count=1): """ @@ -17,7 +12,6 @@ def get_messages(self, timeout=0.1, count=1): :param count: int, number of messages :return: generator with raw messages """ - raise NotImplementedError @abstractmethod def get_offset(self, partition_id): @@ -27,19 +21,15 @@ def get_offset(self, partition_id): :param partition_id: int :return: int consumer offset """ - raise NotImplementedError - def close(self): + def close(self): # noqa: B027 """ Performs necessary cleanup and closes consumer. :return: none """ - pass - -@six.add_metaclass(ABCMeta) -class BaseStreamProducer(object): +class BaseStreamProducer(metaclass=ABCMeta): @abstractmethod def send(self, key, *messages): """ @@ -47,7 +37,6 @@ def send(self, key, *messages): :param key: str key used for partitioning, None for non-keyed channels :param *messages: encoded message(s) """ - raise NotImplementedError @abstractmethod def flush(self): @@ -55,8 +44,8 @@ def flush(self): Flushes all internal buffers. :return: nothing """ - raise NotImplementedError + @abstractmethod def get_offset(self, partition_id): """ Returns producer offset for partition. Raises KeyError, if partition isn't available or doesn't exist. @@ -65,18 +54,15 @@ def get_offset(self, partition_id): :param partition_id: int :return: int producer offset """ - raise NotImplementedError - def close(self): + def close(self): # noqa: B027 """ Performs all necessary cleanup and closes the producer. :return: none """ - pass -@six.add_metaclass(ABCMeta) -class BaseSpiderLogStream(object): +class BaseSpiderLogStream(metaclass=ABCMeta): """ Spider Log Stream base class. This stream transfers results from spiders to Strategy and DB workers. Any producer can write to any partition of this stream. Consumers can be bound to specific partition (SW worker) or not @@ -102,8 +88,7 @@ def consumer(self, partition_id, type): raise NotImplementedError -@six.add_metaclass(ABCMeta) -class BaseScoringLogStream(object): +class BaseScoringLogStream(metaclass=ABCMeta): """ Scoring log stream base class. This stream is transfering score and scheduling information from Strategy workers to DB Workers. This type of stream isn't requiring any partitioning. @@ -124,8 +109,7 @@ def producer(self): raise NotImplementedError -@six.add_metaclass(ABCMeta) -class BaseSpiderFeedStream(object): +class BaseSpiderFeedStream(metaclass=ABCMeta): """ Spider Feed Stream base class. This stream transfers new batches from DB worker to spiders. Every consumer is strictly bounded to specific partition, and producer could write to any partition. This class also has methods @@ -139,7 +123,6 @@ def consumer(self, partition_id): :param partition_id: int :return: BaseStreamConsumer instance assigned to given partition_id """ - raise NotImplementedError @abstractmethod def producer(self): @@ -148,7 +131,6 @@ def producer(self): (separating feed by hosts, so each host will be downloaded by at most one spider). 
:return: BaseStreamProducer instance """ - raise NotImplementedError @abstractmethod def available_partitions(self): @@ -156,27 +138,25 @@ def available_partitions(self): Returns the iterable of available (ready for processing new batches) partitions. :return: iterable of ints """ - raise NotImplementedError + @abstractmethod def mark_ready(self, partition_id): """ Marks partition as ready/available for receiving new batches. :param partition_id: int :return: nothing """ - pass + @abstractmethod def mark_busy(self, partition_id): """ Marks partition as busy, so that spider assigned to this partition is busy processing previous batches. :param partition_id: int :return: nothing """ - pass -@six.add_metaclass(ABCMeta) -class BaseMessageBus(object): +class BaseMessageBus(metaclass=ABCMeta): """ Main message bus class, encapsulating message bus context. Serving as a factory for stream-specific objects. """ diff --git a/frontera/core/models.py b/frontera/core/models.py index c1c8de734..596fee82b 100644 --- a/frontera/core/models.py +++ b/frontera/core/models.py @@ -1,10 +1,9 @@ -from __future__ import absolute_import import copy -from w3lib.util import to_bytes, to_native_str -from w3lib.url import safe_url_string +from w3lib.util import to_bytes, to_unicode -class FrontierObject(object): + +class FrontierObject: def copy(self): return copy.copy(self) @@ -16,7 +15,10 @@ class Request(FrontierObject): :class:`Response ` object when crawled. """ - def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=''): + + def __init__( + self, url, method=b"GET", headers=None, cookies=None, meta=None, body="" + ): """ :param string url: URL to send. :param string method: HTTP method to use. @@ -25,11 +27,11 @@ def __init__(self, url, method=b'GET', headers=None, cookies=None, meta=None, bo :param dict meta: dictionary that contains arbitrary metadata for this request, the keys must be bytes and \ the values must be either bytes or serializable objects such as lists, tuples, dictionaries with byte type items. """ - self._url = to_native_str(url) - self._method = to_bytes((method or b'GET').upper()) + self._url = to_unicode(url) + self._method = to_bytes((method or b"GET").upper()) self._headers = headers or {} self._cookies = cookies or {} - self._meta = meta or {b'scrapy_meta': {}} + self._meta = meta or {b"scrapy_meta": {}} self._body = body @property @@ -79,9 +81,7 @@ def body(self): return self._body def __str__(self): - return "<%s at 0x%0x %s meta=%s body=%s... cookies=%s, headers=%s>" % (type(self).__name__, id(self), self.url, - str(self.meta), str(self.body[:20]), - str(self.cookies), str(self.headers)) + return f"<{type(self).__name__} at 0x{id(self):0x} {self.url} meta={self.meta!s} body={self.body[:20]!s}... cookies={self.cookies!s}, headers={self.headers!s}>" __repr__ = __str__ @@ -93,7 +93,7 @@ class Response(FrontierObject): """ - def __init__(self, url, status_code=200, headers=None, body='', request=None): + def __init__(self, url, status_code=200, headers=None, body="", request=None): """ :param string url: URL of this response. :param int status_code: the HTTP status of the response. Defaults to 200. @@ -102,7 +102,7 @@ def __init__(self, url, status_code=200, headers=None, body='', request=None): :param Request request: The Request object that generated this response. 
""" - self._url = to_native_str(url) + self._url = to_unicode(url) self._status_code = int(status_code) self._headers = headers or {} self._body = body @@ -151,14 +151,12 @@ def meta(self): """ try: return self.request.meta - except AttributeError: - raise AttributeError("Response.meta not available, this response " - "is not tied to any request") + except AttributeError as e: + raise AttributeError( + "Response.meta not available, this response is not tied to any request" + ) from e def __str__(self): - return "<%s at 0x%0x %s %s meta=%s body=%s... headers=%s>" % (type(self).__name__, - id(self), self.status_code, - self.url, str(self.meta), - str(self.body[:20]), str(self.headers)) + return f"<{type(self).__name__} at 0x{id(self):0x} {self.status_code} {self.url} meta={self.meta!s} body={self.body[:20]!s}... headers={self.headers!s}>" __repr__ = __str__ diff --git a/frontera/exceptions.py b/frontera/exceptions.py index d33d0d3fd..bfc1b1f03 100644 --- a/frontera/exceptions.py +++ b/frontera/exceptions.py @@ -1,3 +1,2 @@ class NotConfigured(Exception): """Indicates a missing configuration situation""" - pass diff --git a/frontera/logger/filters/__init__.py b/frontera/logger/filters/__init__.py index 92784b51d..5892727d4 100644 --- a/frontera/logger/filters/__init__.py +++ b/frontera/logger/filters/__init__.py @@ -1,48 +1,50 @@ -from __future__ import absolute_import import logging -import six -from w3lib.util import to_native_str + +from w3lib.util import to_unicode class PlainValuesFilter(logging.Filter): def __init__(self, separator=None, excluded_fields=None, msg_max_length=0): - super(PlainValuesFilter, self).__init__() - self.separator = to_native_str(separator or " ") + super().__init__() + self.separator = to_unicode(separator or " ") self.excluded_fields = excluded_fields or [] self.msg_max_length = msg_max_length def filter(self, record): if isinstance(record.msg, dict): for field_name in self.excluded_fields: - setattr(record, field_name, record.msg.get(field_name, '')) - record.msg = self.separator.join([to_native_str(value) - for key, value in six.iteritems(record.msg) - if key not in self.excluded_fields]) + setattr(record, field_name, record.msg.get(field_name, "")) + record.msg = self.separator.join( + [ + to_unicode(value) + for key, value in record.msg.items() + if key not in self.excluded_fields + ] + ) if self.msg_max_length and len(record.msg) > self.msg_max_length: - record.msg = record.msg[0:self.msg_max_length-3] + "..." + record.msg = record.msg[0 : self.msg_max_length - 3] + "..." 
return True class FilterFields(logging.Filter): def __init__(self, field_name): - super(FilterFields, self).__init__() + super().__init__() self.field_name = field_name def _get_field(self, record): if not self.field_name: return None - elif hasattr(record, self.field_name): + if hasattr(record, self.field_name): return getattr(record, self.field_name) - elif isinstance(record.msg, dict) and self.field_name in record.msg: + if isinstance(record.msg, dict) and self.field_name in record.msg: return record.msg[self.field_name] - else: - return None + return None class IncludeFields(FilterFields): def __init__(self, field_name, included_values): - super(IncludeFields, self).__init__(field_name) + super().__init__(field_name) self.included_values = included_values def filter(self, record): @@ -54,7 +56,7 @@ def filter(self, record): class ExcludeFields(FilterFields): def __init__(self, field_name, excluded_fields): - super(ExcludeFields, self).__init__(field_name) + super().__init__(field_name) self.excluded_fields = excluded_fields def filter(self, record): diff --git a/frontera/logger/formatters/__init__.py b/frontera/logger/formatters/__init__.py index a9f687044..f34134766 100644 --- a/frontera/logger/formatters/__init__.py +++ b/frontera/logger/formatters/__init__.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import import logging LOG_FORMAT = "[%(name)s] %(message)s" @@ -6,7 +5,7 @@ try: from .color import ColorFormatter - LOG_COLOR_FORMAT = "%(log_color)s"+LOG_FORMAT + LOG_COLOR_FORMAT = "%(log_color)s" + LOG_FORMAT COLORS = { "DEBUG": "white", "INFO": "green", @@ -16,8 +15,7 @@ } CONSOLE = ColorFormatter( - format=LOG_COLOR_FORMAT, - log_colors=COLORS.copy(), - log_color_field="levelname") + format=LOG_COLOR_FORMAT, log_colors=COLORS.copy(), log_color_field="levelname" + ) except ImportError: CONSOLE = logging.Formatter(fmt=LOG_FORMAT) diff --git a/frontera/logger/formatters/color.py b/frontera/logger/formatters/color.py index 2ff390eb4..c4f1dc950 100644 --- a/frontera/logger/formatters/color.py +++ b/frontera/logger/formatters/color.py @@ -1,22 +1,19 @@ -from __future__ import absolute_import - -import logging -import sys - -from colorlog.escape_codes import escape_codes from colorlog import ColoredFormatter +from colorlog.escape_codes import escape_codes class ColorFormatter(ColoredFormatter): - - def __init__(self, format, log_colors, log_color_field, datefmt=None, reset=True, style='%'): - super(ColorFormatter, self).__init__(fmt=format, datefmt=datefmt, log_colors=log_colors, - reset=reset, style=style) + def __init__( + self, format, log_colors, log_color_field, datefmt=None, reset=True, style="%" + ): + super().__init__( + format, datefmt=datefmt, log_colors=log_colors, reset=reset, style=style + ) self.log_color_field = log_color_field def format(self, record): if not hasattr(record, self.log_color_field): - setattr(record, self.log_color_field, '?') + setattr(record, self.log_color_field, "?") record.__dict__.update(escape_codes) @@ -28,27 +25,20 @@ def format(self, record): record.log_color = "" # Format the message - if sys.version_info > (2, 7): - message = super(ColoredFormatter, self).format(record) - else: - message = logging.Formatter.format(self, record) + message = super(ColoredFormatter, self).format(record) # Add a reset code to the end of the message # (if it wasn't explicitly added in format str) - if self.reset and not message.endswith(escape_codes['reset']): - message += escape_codes['reset'] + if self.reset and not message.endswith(escape_codes["reset"]): + 
message += escape_codes["reset"] return message def _get_color_field(self, record): if not self.log_color_field: return None - elif hasattr(record, self.log_color_field): + if hasattr(record, self.log_color_field): return getattr(record, self.log_color_field) - elif isinstance(record.msg, dict) and self.log_color_field in record.msg: + if isinstance(record.msg, dict) and self.log_color_field in record.msg: return record.msg[self.log_color_field] - else: - return None - - - + return None diff --git a/frontera/logger/formatters/json.py b/frontera/logger/formatters/json.py index 3efe1f24c..05548683f 100644 --- a/frontera/logger/formatters/json.py +++ b/frontera/logger/formatters/json.py @@ -1,10 +1,9 @@ -from __future__ import absolute_import - from pythonjsonlogger.jsonlogger import JsonFormatter + from frontera.utils.encoders import DateTimeEncoder class JSONFormatter(JsonFormatter): def __init__(self): json_encoder = DateTimeEncoder - super(JSONFormatter, self).__init__(json_encoder=json_encoder) + super().__init__(json_encoder=json_encoder) diff --git a/frontera/logger/handlers/__init__.py b/frontera/logger/handlers/__init__.py index 811822dcd..e9bddaaaa 100644 --- a/frontera/logger/handlers/__init__.py +++ b/frontera/logger/handlers/__init__.py @@ -1,6 +1,5 @@ -from __future__ import absolute_import -import sys import logging +import sys from frontera.logger import formatters diff --git a/frontera/settings/__init__.py b/frontera/settings/__init__.py index ff7f15b76..b461a36ad 100644 --- a/frontera/settings/__init__.py +++ b/frontera/settings/__init__.py @@ -1,17 +1,16 @@ -from __future__ import absolute_import -import six from importlib import import_module from . import default_settings -class BaseSettings(object): +class BaseSettings: """ An object that holds frontier settings values. This also defines the base interface for all classes that are to be used as settings in frontera. """ + def __init__(self, module=None, attributes=None): """ :param object/string module: A :class:`Settings ` object or a path string. 
@@ -39,15 +38,13 @@ def object_from(cls, settings): """ if isinstance(settings, BaseSettings): return settings - else: - return cls(settings) + return cls(settings) def __getattr__(self, name): val = self.get(name) if val is not None: return val - else: - return self.__dict__[name] + return self.__dict__[name] def __setattr__(self, name, value): if name.isupper(): @@ -56,7 +53,7 @@ def __setattr__(self, name, value): self.__dict__[name] = value def add_module(self, module): - if isinstance(module, six.string_types): + if isinstance(module, str): module = import_module(module) for key in dir(module): if key.isupper(): @@ -78,12 +75,12 @@ def set_from_dict(self, attributes): class DefaultSettings(BaseSettings): def __init__(self): - super(DefaultSettings, self).__init__(default_settings) + super().__init__(default_settings) class Settings(BaseSettings): def __init__(self, module=None, attributes=None): - super(Settings, self).__init__(default_settings, attributes) + super().__init__(default_settings, attributes) if module: self.add_module(module) diff --git a/frontera/settings/default_settings.py b/frontera/settings/default_settings.py index b049e7bdc..28e9d2bf8 100644 --- a/frontera/settings/default_settings.py +++ b/frontera/settings/default_settings.py @@ -1,39 +1,37 @@ -from __future__ import absolute_import from datetime import timedelta - AUTO_START = True -BACKEND = 'frontera.contrib.backends.memory.FIFO' +BACKEND = "frontera.contrib.backends.memory.FIFO" BC_MIN_REQUESTS = 64 BC_MIN_HOSTS = 24 BC_MAX_REQUESTS_PER_HOST = 128 -CANONICAL_SOLVER = 'frontera.contrib.canonicalsolvers.Basic' +CANONICAL_SOLVER = "frontera.contrib.canonicalsolvers.Basic" DELAY_ON_EMPTY = 5.0 -DOMAIN_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' +DOMAIN_FINGERPRINT_FUNCTION = "frontera.utils.fingerprint.sha1" -HBASE_THRIFT_HOST = 'localhost' +HBASE_THRIFT_HOST = "localhost" HBASE_THRIFT_PORT = 9090 -HBASE_NAMESPACE = 'crawler' +HBASE_NAMESPACE = "crawler" HBASE_DROP_ALL_TABLES = False -HBASE_METADATA_TABLE = 'metadata' +HBASE_METADATA_TABLE = "metadata" HBASE_USE_SNAPPY = False HBASE_USE_FRAMED_COMPACT = False HBASE_BATCH_SIZE = 9216 HBASE_STATE_CACHE_SIZE_LIMIT = 3000000 -HBASE_QUEUE_TABLE = 'queue' +HBASE_QUEUE_TABLE = "queue" KAFKA_GET_TIMEOUT = 5.0 MAX_NEXT_REQUESTS = 64 MAX_REQUESTS = 0 -MESSAGE_BUS = 'frontera.contrib.messagebus.zeromq.MessageBus' -MESSAGE_BUS_CODEC = 'frontera.contrib.backends.remote.codecs.msgpack' +MESSAGE_BUS = "frontera.contrib.messagebus.zeromq.MessageBus" +MESSAGE_BUS_CODEC = "frontera.contrib.backends.remote.codecs.msgpack" MIDDLEWARES = [ - 'frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware', + "frontera.contrib.middlewares.fingerprint.UrlFingerprintMiddleware", ] NEW_BATCH_DELAY = 30.0 OVERUSED_SLOT_FACTOR = 5.0 QUEUE_HOSTNAME_PARTITIONING = False -REQUEST_MODEL = 'frontera.core.models.Request' -RESPONSE_MODEL = 'frontera.core.models.Response' +REQUEST_MODEL = "frontera.core.models.Request" +RESPONSE_MODEL = "frontera.core.models.Response" SCORING_PARTITION_ID = 0 SCORING_LOG_CONSUMER_BATCH_SIZE = 512 @@ -44,12 +42,12 @@ SQLALCHEMYBACKEND_CACHE_SIZE = 10000 SQLALCHEMYBACKEND_CLEAR_CONTENT = True SQLALCHEMYBACKEND_DROP_ALL_TABLES = True -SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' +SQLALCHEMYBACKEND_ENGINE = "sqlite:///:memory:" SQLALCHEMYBACKEND_ENGINE_ECHO = False SQLALCHEMYBACKEND_MODELS = { - 'MetadataModel': 'frontera.contrib.backends.sqlalchemy.models.MetadataModel', - 'StateModel': 'frontera.contrib.backends.sqlalchemy.models.StateModel', - 
'QueueModel': 'frontera.contrib.backends.sqlalchemy.models.QueueModel' + "MetadataModel": "frontera.contrib.backends.sqlalchemy.models.MetadataModel", + "StateModel": "frontera.contrib.backends.sqlalchemy.models.StateModel", + "QueueModel": "frontera.contrib.backends.sqlalchemy.models.QueueModel", } SQLALCHEMYBACKEND_REVISIT_INTERVAL = timedelta(days=1) STATE_CACHE_SIZE = 1000000 @@ -57,16 +55,16 @@ STORE_CONTENT = False TEST_MODE = False TLDEXTRACT_DOMAIN_INFO = False -URL_FINGERPRINT_FUNCTION = 'frontera.utils.fingerprint.sha1' +URL_FINGERPRINT_FUNCTION = "frontera.utils.fingerprint.sha1" -ZMQ_ADDRESS = '127.0.0.1' +ZMQ_ADDRESS = "127.0.0.1" ZMQ_BASE_PORT = 5550 -LOGGING_CONFIG = 'logging.conf' +LOGGING_CONFIG = "logging.conf" -#-------------------------------------------------------- +# -------------------------------------------------------- # Kafka -#-------------------------------------------------------- +# -------------------------------------------------------- SPIDER_FEED_TOPIC = "frontier-todo" SPIDER_LOG_TOPIC = "frontier-done" @@ -77,4 +75,4 @@ SCORING_LOG_DBW_GROUP = "dbw-scoring-log" SPIDER_FEED_GROUP = "fetchers-spider-feed" -KAFKA_CODEC = None \ No newline at end of file +KAFKA_CODEC = None diff --git a/frontera/utils/async.py b/frontera/utils/async_.py similarity index 75% rename from frontera/utils/async.py rename to frontera/utils/async_.py index 6b98864df..8820cefa4 100644 --- a/frontera/utils/async.py +++ b/frontera/utils/async_.py @@ -1,17 +1,17 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from twisted.internet import reactor, error +from twisted.internet import error from twisted.internet.defer import Deferred -from six.moves import range -class CallLaterOnce(object): +class CallLaterOnce: """Schedule a function to be called in the next reactor loop, but only if it hasn't been already scheduled since the last time it run. 
""" - def __init__(self, func, reactor=reactor, *a, **kw): + + def __init__(self, func, reactor=None, *a, **kw): + from twisted.internet import reactor as default_reactor + self._func = func - self._reactor = reactor + self._reactor = reactor or default_reactor self._a = a self._kw = kw self._call = None @@ -47,11 +47,15 @@ def error(self, f): return f -def listen_tcp(portrange, host, factory, reactor=reactor): +def listen_tcp(portrange, host, factory, reactor=None): """Like reactor.listenTCP but tries different ports in a range.""" + from twisted.internet import reactor as default_reactor + + reactor = reactor or default_reactor + if isinstance(portrange, int): return reactor.listenTCP(portrange, factory, interface=host) - assert len(portrange) <= 2, "invalid portrange: %s" % portrange + assert len(portrange) <= 2, f"invalid portrange: {portrange}" if not portrange: return reactor.listenTCP(0, factory, interface=host) if len(portrange) == 1: @@ -59,6 +63,7 @@ def listen_tcp(portrange, host, factory, reactor=reactor): for x in range(portrange[0], portrange[1] + 1): try: return reactor.listenTCP(x, factory, interface=host) - except error.CannotListenError: + except error.CannotListenError: # noqa: PERF203 if x == portrange[1]: raise + return None diff --git a/frontera/utils/converters.py b/frontera/utils/converters.py index a450d57b4..61ba9433c 100644 --- a/frontera/utils/converters.py +++ b/frontera/utils/converters.py @@ -1,5 +1,6 @@ -class BaseRequestConverter(object): +class BaseRequestConverter: """Converts between frontera and XXX request objects""" + def to_frontier(self, request): """request: XXX > Frontier""" raise NotImplementedError @@ -9,8 +10,9 @@ def from_frontier(self, request): raise NotImplementedError -class BaseResponseConverter(object): +class BaseResponseConverter: """Converts between frontera and XXX response objects""" + def to_frontier(self, response): """response: XXX > Frontier""" raise NotImplementedError diff --git a/frontera/utils/encoders.py b/frontera/utils/encoders.py index ec78ff066..61478b377 100644 --- a/frontera/utils/encoders.py +++ b/frontera/utils/encoders.py @@ -1,16 +1,11 @@ -from __future__ import absolute_import -import json import datetime +import json class DateTimeEncoder(json.JSONEncoder): def default(self, obj): - if isinstance(obj, datetime.datetime): + if isinstance(obj, (datetime.datetime, datetime.date)): return obj.isoformat() - elif isinstance(obj, datetime.date): - return obj.isoformat() - elif isinstance(obj, datetime.timedelta): + if isinstance(obj, datetime.timedelta): return (datetime.datetime.min + obj).time().isoformat() - else: - return super(DateTimeEncoder, self).default(obj) - + return super().default(obj) diff --git a/frontera/utils/fingerprint.py b/frontera/utils/fingerprint.py index 97bb55385..a4d72322a 100644 --- a/frontera/utils/fingerprint.py +++ b/frontera/utils/fingerprint.py @@ -1,19 +1,19 @@ -from __future__ import absolute_import import hashlib -from six.moves.urllib.parse import urlparse -from struct import pack from binascii import hexlify +from struct import pack + +from w3lib.util import to_bytes + from frontera.utils.misc import get_crc32 from frontera.utils.url import parse_url -from w3lib.util import to_native_str, to_bytes def sha1(key): - return to_bytes(hashlib.sha1(to_bytes(key, 'utf8')).hexdigest()) + return to_bytes(hashlib.sha1(to_bytes(key, "utf8")).hexdigest()) # noqa: S324 def md5(key): - return to_bytes(hashlib.md5(to_bytes(key, 'utf8')).hexdigest()) + return to_bytes(hashlib.md5(to_bytes(key, 
"utf8")).hexdigest()) # noqa: S324 def hostname_local_fingerprint(key): @@ -30,9 +30,10 @@ def hostname_local_fingerprint(key): if not result.hostname: return sha1(key) host_checksum = get_crc32(result.hostname) - doc_uri_combined = result.path+';'+result.params+result.query+result.fragment + doc_uri_combined = ( + result.path + ";" + result.params + result.query + result.fragment + ) - doc_uri_combined = to_bytes(doc_uri_combined, 'utf8', 'ignore') - doc_fprint = hashlib.md5(doc_uri_combined).digest() - fprint = hexlify(pack(">i16s", host_checksum, doc_fprint)) - return fprint + doc_uri_combined = to_bytes(doc_uri_combined, "utf8", "ignore") + doc_fprint = hashlib.md5(doc_uri_combined).digest() # noqa: S324 + return hexlify(pack(">i16s", host_checksum, doc_fprint)) diff --git a/frontera/utils/graphs/__init__.py b/frontera/utils/graphs/__init__.py index 182ae958c..b758cc1b0 100644 --- a/frontera/utils/graphs/__init__.py +++ b/frontera/utils/graphs/__init__.py @@ -1,5 +1,11 @@ -from __future__ import absolute_import from . import data from .manager import CrawlGraphManager as Manager from .models import CrawlPage as Page -from .models import CrawlPageRelation as Relation \ No newline at end of file +from .models import CrawlPageRelation as Relation + +__all__ = [ + "Manager", + "Page", + "Relation", + "data", +] diff --git a/frontera/utils/graphs/data.py b/frontera/utils/graphs/data.py index f925cdf99..5825c7abd 100644 --- a/frontera/utils/graphs/data.py +++ b/frontera/utils/graphs/data.py @@ -1,36 +1,36 @@ -from __future__ import absolute_import -from six.moves import range - - -def create_test_site(prefix, max_depth, n_links_per_page, self_link=False, site=None, depth=0): +def create_test_site( + prefix, max_depth, n_links_per_page, self_link=False, site=None, depth=0 +): if not site: site = [] prefix += str(1) depth += 1 if depth < max_depth: page = prefix - links = [page + str(l) for l in range(1, n_links_per_page+1)] + links = [page + str(link) for link in range(1, n_links_per_page + 1)] site.append((page, links)) for link in links: - create_test_site(prefix=link, - max_depth=max_depth, - n_links_per_page=n_links_per_page, - self_link=self_link, - site=site, - depth=depth) + create_test_site( + prefix=link, + max_depth=max_depth, + n_links_per_page=n_links_per_page, + self_link=self_link, + site=site, + depth=depth, + ) if self_link: links.append(page) return site -class CrawlSiteData(object): - def __init__(self, pages, name='', description=''): +class CrawlSiteData: + def __init__(self, pages, name="", description=""): self.name = name self.description = description self.pages = pages def __repr__(self): - return '' % (self.name, len(self.pages)) + return f"" @property def nodes(self): @@ -45,182 +45,214 @@ def __len__(self): return len(self.nodes) -class CrawlSiteListData(object): - def __init__(self, sites, name='', description='', use_urls=False): +class CrawlSiteListData: + def __init__(self, sites, name="", description="", use_urls=False): self.name = name self.description = description self.sites = sites self.use_urls = use_urls def __repr__(self): - return '' % (self.name, len(self.sites)) + return f"" def __len__(self): return sum([len(site) for site in self.sites]) -#----------------------------------------------------- +# ----------------------------------------------------- # Sites -#----------------------------------------------------- -SITE_A = CrawlSiteData( - name='A', - description='', - pages=create_test_site('A', 4, 2)) +# 
----------------------------------------------------- +SITE_A = CrawlSiteData(name="A", description="", pages=create_test_site("A", 4, 2)) -SITE_B = CrawlSiteData( - name='B', - description='', - pages=create_test_site('B', 4, 2)) +SITE_B = CrawlSiteData(name="B", description="", pages=create_test_site("B", 4, 2)) SITE_C = CrawlSiteData( - name='C', - description='', - pages=create_test_site('C', 5, 2, self_link=True)) + name="C", description="", pages=create_test_site("C", 5, 2, self_link=True) +) -#----------------------------------------------------- +# ----------------------------------------------------- # Graphs -#----------------------------------------------------- +# ----------------------------------------------------- SITE_LIST_01 = CrawlSiteListData( - name='GRAPH 01', - description='', + name="GRAPH 01", + description="", sites=[ SITE_A, - ]) + ], +) SITE_LIST_02 = CrawlSiteListData( - name='GRAPH 02', - description='', + name="GRAPH 02", + description="", sites=[ SITE_A, SITE_B, - ]) + ], +) SITE_LIST_03 = CrawlSiteListData( - name='GRAPH 03', - description='', + name="GRAPH 03", + description="", sites=[ SITE_C, - ]) + ], +) SITE_LIST_04 = CrawlSiteListData( - name='GRAPH 04', - description='', + name="GRAPH 04", + description="", sites=[ [ - ('A', ['B']), - ('B', ['A']), + ("A", ["B"]), + ("B", ["A"]), ], - ]) + ], +) SITE_LIST_05 = CrawlSiteListData( - name='GRAPH 05', - description='', + name="GRAPH 05", + description="", sites=[ [ - ('A', ['B', 'C']), - ('B', ['A', 'C']), - ('C', ['A', 'B']), + ("A", ["B", "C"]), + ("B", ["A", "C"]), + ("C", ["A", "B"]), ], - ]) + ], +) SITE_LIST_06 = CrawlSiteListData( - name='GRAPH 06', - description='', + name="GRAPH 06", + description="", sites=[ [ - ('A', ['B', 'C']), - ('B', []), - ('C', ['B']), + ("A", ["B", "C"]), + ("B", []), + ("C", ["B"]), ] - ]) + ], +) SITE_LIST_07 = CrawlSiteListData( - name='GRAPH 07', - description='', + name="GRAPH 07", + description="", sites=[ [ - ('A', ['A']), + ("A", ["A"]), ] - ]) + ], +) SITE_LIST_08 = CrawlSiteListData( - name='GRAPH 08', - description='', + name="GRAPH 08", + description="", use_urls=True, sites=[ [ - ('https://www.a.com', [ - 'http://www.a.com/2', - 'http://www.a.net', - ]), + ( + "https://www.a.com", + [ + "http://www.a.com/2", + "http://www.a.net", + ], + ), ], [ - ('https://www.a.net', []), + ("https://www.a.net", []), ], [ - ('http://b.com', [ - 'http://b.com/2', - 'http://www.a.net', - 'http://test.cloud.c.com', - 'http://b.com', - ]), - ('http://b.com/entries?page=2', [ - 'http://b.com/entries?page=2', - 'http://b.com', - ]), + ( + "http://b.com", + [ + "http://b.com/2", + "http://www.a.net", + "http://test.cloud.c.com", + "http://b.com", + ], + ), + ( + "http://b.com/entries?page=2", + [ + "http://b.com/entries?page=2", + "http://b.com", + ], + ), ], [ - ('http://test.cloud.c.com', [ - 'http://cloud.c.com', - 'http://test.cloud.c.com/2', - ]), - ('http://test.cloud.c.com/2', [ - 'http://b.com/entries?page=2', - 'http://test.cloud.c.com', - ]), + ( + "http://test.cloud.c.com", + [ + "http://cloud.c.com", + "http://test.cloud.c.com/2", + ], + ), + ( + "http://test.cloud.c.com/2", + [ + "http://b.com/entries?page=2", + "http://test.cloud.c.com", + ], + ), ], - ]) + ], +) SITE_LIST_09 = CrawlSiteListData( - name='GRAPH 09', - description='', + name="GRAPH 09", + description="", use_urls=True, sites=[ [ - ('https://www.a.com', [ - 'http://www.a.com/2', - 'http://www.a.com/2/1', - 'http://www.a.com/3', - 'http://www.a.com/2/1/3', - 'http://www.a.com/2/4/1', - 
'http://www.a.com/2/4/2', - 'http://www.a.net', - ]), + ( + "https://www.a.com", + [ + "http://www.a.com/2", + "http://www.a.com/2/1", + "http://www.a.com/3", + "http://www.a.com/2/1/3", + "http://www.a.com/2/4/1", + "http://www.a.com/2/4/2", + "http://www.a.net", + ], + ), ], [ - ('http://b.com', [ - 'http://b.com/2', - 'http://www.a.net', - 'http://test.cloud.c.com', - 'http://b.com', - ]), - ('http://b.com/entries?page=2', [ - 'http://b.com/entries?page=2', - 'http://b.com', - ]), + ( + "http://b.com", + [ + "http://b.com/2", + "http://www.a.net", + "http://test.cloud.c.com", + "http://b.com", + ], + ), + ( + "http://b.com/entries?page=2", + [ + "http://b.com/entries?page=2", + "http://b.com", + ], + ), ], [ - ('http://test.cloud.c.com', [ - 'http://cloud.c.com', - 'http://test.cloud.c.com/2', - ]), - ('http://test.cloud.c.com/2', [ - 'http://b.com/entries?page=2', - 'http://test.cloud.c.com', - ]), + ( + "http://test.cloud.c.com", + [ + "http://cloud.c.com", + "http://test.cloud.c.com/2", + ], + ), + ( + "http://test.cloud.c.com/2", + [ + "http://b.com/entries?page=2", + "http://test.cloud.c.com", + ], + ), ], - ]) + ], +) GRAPHS = [obj for obj in locals().values() if isinstance(obj, CrawlSiteListData)] -#GRAPHS = [SITE_LIST_08] +# GRAPHS = [SITE_LIST_08] diff --git a/frontera/utils/graphs/generate_diagrams.py b/frontera/utils/graphs/generate_diagrams.py index 139615dd2..87f287214 100644 --- a/frontera/utils/graphs/generate_diagrams.py +++ b/frontera/utils/graphs/generate_diagrams.py @@ -1,25 +1,22 @@ -from __future__ import absolute_import -from __future__ import print_function -import os import sys +from pathlib import Path -from .manager import CrawlGraphManager from .data import GRAPHS +from .manager import CrawlGraphManager -SCRIPT_FOLDER = os.path.abspath(os.path.split(sys.argv[0])[0]) -CHARTS_FOLDER = os.path.join(SCRIPT_FOLDER, 'diagrams') +SCRIPT_FOLDER = Path(sys.argv[0]).parent.absolute() +CHARTS_FOLDER = SCRIPT_FOLDER / "diagrams" def generate_filename(graph_name): name = graph_name - name = name.replace(' ', '_') + name = name.replace(" ", "_") name = name.lower() - name = '%s.png' % name - return name + return f"{name}.png" def generate_graph_diagram(filename, title, graph): - print("generating png diagram for test '%s'..." 
% title) + print(f"generating png diagram for test '{title}'...") manager = CrawlGraphManager() manager.add_site_list(graph) manager.render(filename, label=title, use_urls=graph.use_urls) @@ -27,24 +24,12 @@ def generate_graph_diagram(filename, title, graph): def generate_diagrams(): for graph in GRAPHS: - generate_graph_diagram(filename=os.path.join(CHARTS_FOLDER, generate_filename(graph.name)), - title=graph.name, - graph=graph) + generate_graph_diagram( + filename=CHARTS_FOLDER / generate_filename(graph.name), + title=graph.name, + graph=graph, + ) -if __name__ == '__main__': +if __name__ == "__main__": generate_diagrams() - - - - - - - - - - - - - - diff --git a/frontera/utils/graphs/manager.py b/frontera/utils/graphs/manager.py index 001eef1a8..7f6cc96f8 100644 --- a/frontera/utils/graphs/manager.py +++ b/frontera/utils/graphs/manager.py @@ -1,30 +1,38 @@ -from __future__ import absolute_import from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker -from .models import Base, CrawlPage from .data import CrawlSiteData, CrawlSiteListData +from .models import Base, CrawlPage -DEFAULT_ENGINE = 'sqlite:///:memory:' +DEFAULT_ENGINE = "sqlite:///:memory:" -class CrawlGraphManager(object): - def __init__(self, engine=DEFAULT_ENGINE, autocommit=False, autoflush=False, - echo=False, drop_all_tables=False, clear_content=False): +class CrawlGraphManager: + def __init__( + self, + engine=DEFAULT_ENGINE, + autocommit=False, + autoflush=False, + echo=False, + drop_all_tables=False, + clear_content=False, + ): self.engine = create_engine(engine, echo=echo) if drop_all_tables: Base.metadata.drop_all(self.engine) Base.metadata.create_all(self.engine) self.Session = sessionmaker() - self.Session.configure(bind=self.engine, autocommit=autocommit, autoflush=autoflush) + self.Session.configure( + bind=self.engine, autocommit=autocommit, autoflush=autoflush + ) self.session = self.Session() if clear_content: - for name, table in Base.metadata.tables.items(): + for table in Base.metadata.tables.values(): self.session.execute(table.delete()) @property def pages(self): - return [page for page in CrawlPage.query(self.session).all()] + return list(CrawlPage.query(self.session).all()) @property def seeds(self): @@ -58,29 +66,55 @@ def add_site(self, site, default_status=200, default_n_redirects=0): for i, (info, links) in enumerate(pages): if isinstance(info, tuple): if len(info) == 2: - status, page_url, n_redirects = (info[0], info[1], default_n_redirects) + status, page_url, n_redirects = ( + info[0], + info[1], + default_n_redirects, + ) else: status, page_url, n_redirects = info else: - status, page_url, n_redirects = (default_status, info, default_n_redirects) - page = self.add_page(url=page_url, status=status, n_redirects=n_redirects, is_seed=(i == 0)) + status, page_url, n_redirects = ( + default_status, + info, + default_n_redirects, + ) + page = self.add_page( + url=page_url, status=status, n_redirects=n_redirects, is_seed=(i == 0) + ) for link_url in links: self.add_link(page=page, url=link_url, status=default_status) def add_site_list(self, graph, default_status=200, default_n_redirects=0): sites = graph.sites if isinstance(graph, CrawlSiteListData) else graph for site in sites: - self.add_site(site=site, default_status=default_status, default_n_redirects=default_n_redirects) + self.add_site( + site=site, + default_status=default_status, + default_n_redirects=default_n_redirects, + ) def save(self): self.session.commit() - def render(self, filename, label='', labelloc='t', 
labeljust='c', - rankdir="TB", ranksep=0.7, - fontname='Arial', fontsize=24, - use_urls=False, - node_fixedsize='true', nodesep=0.1, node_width=0.85, node_height=0.85, node_fontsize=15, - include_ids=False): + def render( + self, + filename, + label="", + labelloc="t", + labeljust="c", + rankdir="TB", + ranksep=0.7, + fontname="Arial", + fontsize=24, + use_urls=False, + node_fixedsize="true", + nodesep=0.1, + node_width=0.85, + node_height=0.85, + node_fontsize=15, + include_ids=False, + ): import pydot # Graph @@ -92,11 +126,9 @@ def render(self, filename, label='', labelloc='t', labeljust='c', "fontsize": fontsize, } if label: - graph_args.update({ - "labelloc": labelloc, - "labeljust": labeljust, - "label": label - }) + graph_args.update( + {"labelloc": labelloc, "labeljust": labeljust, "label": label} + ) graph = pydot.Dot(**graph_args) # Node @@ -104,32 +136,42 @@ def render(self, filename, label='', labelloc='t', labeljust='c', "fontsize": node_fontsize, } if use_urls: - node_seed_shape = 'rectangle' - node_shape = 'oval' + node_seed_shape = "rectangle" + node_shape = "oval" else: - node_seed_shape = 'square' - node_shape = 'circle' - node_args.update({ - "fixedsize": node_fixedsize, - "width": node_width, - "height": node_height, - }) + node_seed_shape = "square" + node_shape = "circle" + node_args.update( + { + "fixedsize": node_fixedsize, + "width": node_width, + "height": node_height, + } + ) graph.set_node_defaults(**node_args) for page in self.pages: - graph.add_node(pydot.Node(name=self._clean_page_name(page, include_id=include_ids), - fontname=fontname, - fontsize=node_fontsize, - shape=node_seed_shape if page.is_seed else node_shape)) + graph.add_node( + pydot.Node( + name=self._clean_page_name(page, include_id=include_ids), + fontname=fontname, + fontsize=node_fontsize, + shape=node_seed_shape if page.is_seed else node_shape, + ) + ) for link in page.links: - graph.add_edge(pydot.Edge(self._clean_page_name(page, include_id=include_ids), - self._clean_page_name(link, include_id=include_ids))) + graph.add_edge( + pydot.Edge( + self._clean_page_name(page, include_id=include_ids), + self._clean_page_name(link, include_id=include_ids), + ) + ) graph.write_png(filename) def _clean_page_name(self, page, include_id): cleaned_name = page.url - cleaned_name = cleaned_name.replace('http://', '') - cleaned_name = cleaned_name.replace('https://', '') + cleaned_name = cleaned_name.replace("http://", "") + cleaned_name = cleaned_name.replace("https://", "") if include_id: - cleaned_name = "%d. %s" % (page.id, cleaned_name) - return cleaned_name \ No newline at end of file + cleaned_name = f"{page.id}. 
{cleaned_name}" + return cleaned_name diff --git a/frontera/utils/graphs/models.py b/frontera/utils/graphs/models.py index c4040514e..c45b3e8d0 100644 --- a/frontera/utils/graphs/models.py +++ b/frontera/utils/graphs/models.py @@ -1,10 +1,14 @@ -from __future__ import absolute_import -from sqlalchemy import Column, String, Integer, Boolean, ForeignKey -from sqlalchemy.orm import relation -from sqlalchemy import UniqueConstraint +from sqlalchemy import ( + Boolean, + Column, + ForeignKey, + Integer, + String, + UniqueConstraint, + types, +) from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import types -import six +from sqlalchemy.orm import relation Base = declarative_base() @@ -14,11 +18,11 @@ class Choice(types.TypeDecorator): def __init__(self, choices, default, **kwargs): self.choices = dict(choices) - values = [k for k, v in six.iteritems(self.choices)] + values = [k for k, v in self.choices.items()] if default not in values: - raise ValueError("default value '%s' not found in choices %s" % (default, values)) + raise ValueError(f"default value '{default}' not found in choices {values}") self.default = default - super(Choice, self).__init__(**kwargs) + super().__init__(**kwargs) def process_bind_param(self, value, dialect): return value or self.default @@ -27,7 +31,7 @@ def process_result_value(self, value, dialect): return self.choices[value] -class BaseModel(object): +class BaseModel: __abstract__ = True @classmethod @@ -51,10 +55,9 @@ def get_or_create(cls, session, **kwargs): instance = session.query(cls).filter_by(**kwargs).first() if instance: return instance, False - else: - instance = cls(**kwargs) - session.add(instance) - return instance, True + instance = cls(**kwargs) + session.add(instance) + return instance, True def get_pk(self): return getattr(self, self.get_pk_name()) @@ -69,16 +72,18 @@ class Model(Base, BaseModel): class CrawlPageRelation(Model): - __tablename__ = 'crawl_page_relations' - parent_id = Column(Integer, ForeignKey('crawl_pages.id'), primary_key=True, index=True) - child_id = Column(Integer, ForeignKey('crawl_pages.id'), primary_key=True, index=True) + __tablename__ = "crawl_page_relations" + parent_id = Column( + Integer, ForeignKey("crawl_pages.id"), primary_key=True, index=True + ) + child_id = Column( + Integer, ForeignKey("crawl_pages.id"), primary_key=True, index=True + ) class CrawlPage(Model): - __tablename__ = 'crawl_pages' - __table_args__ = ( - UniqueConstraint('url'), - ) + __tablename__ = "crawl_pages" + __table_args__ = (UniqueConstraint("url"),) id = Column(Integer, primary_key=True, nullable=False, index=True, unique=True) url = Column(String(1000)) @@ -86,14 +91,15 @@ class CrawlPage(Model): n_redirects = Column(Integer, default=0) is_seed = Column(Boolean, default=False) referers = relation( - 'CrawlPage', - secondary='crawl_page_relations', + "CrawlPage", + secondary="crawl_page_relations", primaryjoin=CrawlPageRelation.child_id == id, secondaryjoin=CrawlPageRelation.parent_id == id, - backref="links") + backref="links", + ) def __repr__(self): - return '<%s:%s%s>' % (self.id, self.url, '*' if self.is_seed else '') + return f"<{self.id}:{self.url}{'*' if self.is_seed else ''}>" def _get_status_code(self): try: @@ -110,5 +116,4 @@ def is_redirection(self): status_code = self._get_status_code() if status_code: return 300 <= status_code < 400 - else: - return False + return False diff --git a/frontera/utils/heap.py b/frontera/utils/heap.py index c2845c508..662ff8156 100644 --- a/frontera/utils/heap.py +++ 
b/frontera/utils/heap.py @@ -1,33 +1,27 @@ -from __future__ import absolute_import -from __future__ import print_function import heapq import math from io import StringIO -def show_tree(tree, total_width=80, fill=' '): +def show_tree(tree, total_width=80, fill=" "): """Pretty-print a tree.""" - print('-' * total_width) + print("-" * total_width) output = StringIO() last_row = -1 for i, n in enumerate(tree): - if i: - row = int(math.floor(math.log(i+1, 2))) - else: - row = 0 + row = int(math.floor(math.log2(i + 1))) if i else 0 if row != last_row: - output.write('\n') + output.write("\n") columns = 2**row col_width = int(math.floor((total_width * 1.0) / columns)) output.write(str(n).center(col_width, fill)) last_row = row print(output.getvalue()) - print('-' * total_width) + print("-" * total_width) print() - return -class HeapObjectWrapper(object): +class HeapObjectWrapper: def __init__(self, obj, compare_function): self.obj = obj self.compare_function = compare_function @@ -36,16 +30,10 @@ def __cmp__(self, other): return self.compare_function(self.obj, other.obj) def __lt__(self, other): - if self.compare_function(self.obj, other.obj) == -1: - return True - else: - return False + return self.compare_function(self.obj, other.obj) == -1 def __eq__(self, other): - if self.compare_function(self.obj, other.obj) == 0: - return True - else: - return False + return self.compare_function(self.obj, other.obj) == 0 def __repr__(self): return repr(self.obj) @@ -54,7 +42,7 @@ def __str__(self): return str(self.obj) -class Heap(object): +class Heap: def __init__(self, compare_function): self.heap = [] self._compare_function = compare_function @@ -69,8 +57,7 @@ def pop(self, n): pages.append(page) if n and len(pages) >= n: break - else: - page = self._extract_object() + page = self._extract_object() return pages def _extract_object(self): diff --git a/frontera/utils/managers.py b/frontera/utils/managers.py index 867f7a4f6..3c4487178 100644 --- a/frontera/utils/managers.py +++ b/frontera/utils/managers.py @@ -1,9 +1,9 @@ -from __future__ import absolute_import from frontera.core.manager import FrontierManager + from .converters import BaseRequestConverter, BaseResponseConverter -class FrontierManagerWrapper(object): +class FrontierManagerWrapper: def __init__(self, settings, manager=None): manager = manager or FrontierManager self.manager = manager.from_settings(settings) @@ -11,14 +11,20 @@ def __init__(self, settings, manager=None): self.response_converter = None def start(self): - if not hasattr(self, 'request_converter'): - raise NotImplementedError("Request converter should be instantiated in subclass") - if not hasattr(self, 'response_converter'): - raise NotImplementedError("Response converter should be instantiated in subclass") - assert isinstance(self.request_converter, BaseRequestConverter), 'request_converter ' \ - 'must be instance of BaseRequestConverter' - assert isinstance(self.response_converter, BaseResponseConverter), 'response_converter ' \ - 'must be instance of BaseResponseConverter' + if not hasattr(self, "request_converter"): + raise NotImplementedError( + "Request converter should be instantiated in subclass" + ) + if not hasattr(self, "response_converter"): + raise NotImplementedError( + "Response converter should be instantiated in subclass" + ) + assert isinstance(self.request_converter, BaseRequestConverter), ( + "request_converter must be instance of BaseRequestConverter" + ) + assert isinstance(self.response_converter, BaseResponseConverter), ( + "response_converter must 
be instance of BaseResponseConverter" + ) self.manager.start() def stop(self): @@ -29,20 +35,27 @@ def add_seeds(self, seeds): self.manager.add_seeds(seeds=frontier_seeds) def get_next_requests(self, max_next_requests=0, **kwargs): - frontier_requests = self.manager.get_next_requests(max_next_requests=max_next_requests, **kwargs) - return [self.request_converter.from_frontier(frontier_request) for frontier_request in frontier_requests] + frontier_requests = self.manager.get_next_requests( + max_next_requests=max_next_requests, **kwargs + ) + return [ + self.request_converter.from_frontier(frontier_request) + for frontier_request in frontier_requests + ] def page_crawled(self, response): self.manager.page_crawled(self.response_converter.to_frontier(response)) def links_extracted(self, request, links): frontier_links = [self.request_converter.to_frontier(link) for link in links] - self.manager.links_extracted(request=self.request_converter.to_frontier(request), - links=frontier_links) + self.manager.links_extracted( + request=self.request_converter.to_frontier(request), links=frontier_links + ) def request_error(self, request, error): - self.manager.request_error(request=self.request_converter.to_frontier(request), - error=error) + self.manager.request_error( + request=self.request_converter.to_frontier(request), error=error + ) def finished(self): return self.manager.finished diff --git a/frontera/utils/misc.py b/frontera/utils/misc.py index 15731195f..7ae91c712 100644 --- a/frontera/utils/misc.py +++ b/frontera/utils/misc.py @@ -1,9 +1,7 @@ -from __future__ import absolute_import from importlib import import_module from zlib import crc32 -from six.moves import range + from w3lib.util import to_bytes -import six def load_object(path): @@ -14,62 +12,61 @@ def load_object(path): """ try: - dot = path.rindex('.') - except ValueError: - raise ValueError("Error loading object '%s': not a full path" % path) + dot = path.rindex(".") + except ValueError as e: + raise ValueError(f"Error loading object '{path}': not a full path") from e - module, name = path[:dot], path[dot+1:] + module, name = path[:dot], path[dot + 1 :] try: mod = import_module(module) except ImportError as e: - raise ImportError("Error loading object '%s': %s" % (path, e)) + raise ImportError(f"Error loading object '{path}': {e}") from e try: obj = getattr(mod, name) - except AttributeError: - raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name)) + except AttributeError as e: + raise NameError( + f"Module '{module}' doesn't define any object named '{name}'" + ) from e return obj def get_crc32(name): - """ signed crc32 of bytes or unicode. + """signed crc32 of bytes or unicode. In python 3, return the same number as in python 2, converting to [-2**31, 2**31-1] range. This is done to maintain backwards compatibility with python 2, since checksums are stored in the database, so this allows to keep the same database schema. """ - return to_signed32(crc32(to_bytes(name, 'utf-8', 'ignore'))) + return to_signed32(crc32(to_bytes(name, "utf-8", "ignore"))) def to_signed32(x): - """ If x is an usigned 32-bit int, convert it to a signed 32-bit. 
- """ - return x - 0x100000000 if x > 0x7fffffff else x + """If x is an usigned 32-bit int, convert it to a signed 32-bit.""" + return x - 0x100000000 if x > 0x7FFFFFFF else x -def chunks(l, n): +def chunks(l, n): # noqa: E741 for i in range(0, len(l), n): - yield l[i:i+n] + yield l[i : i + n] def dict_to_bytes(obj): if isinstance(obj, dict): - return {dict_to_bytes(k): dict_to_bytes(v) for k, v in six.iteritems(obj)} - if isinstance(obj, six.text_type): - return obj.encode('utf8') + return {dict_to_bytes(k): dict_to_bytes(v) for k, v in obj.items()} + if isinstance(obj, str): + return obj.encode("utf8") if isinstance(obj, list): return map(dict_to_bytes, obj) - else: - return obj + return obj def dict_to_unicode(obj): if isinstance(obj, dict): - return {dict_to_unicode(k): dict_to_unicode(v) for k, v in six.iteritems(obj)} - if isinstance(obj, six.binary_type): - return obj.decode('utf8') + return {dict_to_unicode(k): dict_to_unicode(v) for k, v in obj.items()} + if isinstance(obj, bytes): + return obj.decode("utf8") if isinstance(obj, list): return map(dict_to_unicode, obj) - else: - return obj \ No newline at end of file + return obj diff --git a/frontera/utils/tester.py b/frontera/utils/tester.py index 29956406f..d25680696 100644 --- a/frontera/utils/tester.py +++ b/frontera/utils/tester.py @@ -1,14 +1,11 @@ -from __future__ import absolute_import - from collections import OrderedDict, deque -from six.moves.urllib.parse import urlparse -import six -from six.moves import range - +from urllib.parse import urlparse -class FrontierTester(object): - def __init__(self, frontier, graph_manager, downloader_simulator, max_next_requests=0): +class FrontierTester: + def __init__( + self, frontier, graph_manager, downloader_simulator, max_next_requests=0 + ): self.frontier = frontier self.graph_manager = graph_manager self.max_next_requests = max_next_requests @@ -31,7 +28,9 @@ def run(self, add_all_pages=False): self.frontier.stop() def _add_seeds(self): - self.frontier.add_seeds([self._make_request(seed.url) for seed in self.graph_manager.seeds]) + self.frontier.add_seeds( + [self._make_request(seed.url) for seed in self.graph_manager.seeds] + ) def _add_all(self): for page in self.graph_manager.pages: @@ -42,22 +41,24 @@ def _add_all(self): self.frontier.add_seeds([self._make_request(link.url)]) def _make_request(self, url): - r = self.frontier.request_model(url=url, - headers={ - b'X-Important-Header': b'Frontera' - }, - method=b'POST', - cookies={b'currency': b'USD'}) - r.meta[b'this_param'] = b'should be passed over' + r = self.frontier.request_model( + url=url, + headers={b"X-Important-Header": b"Frontera"}, + method=b"POST", + cookies={b"currency": b"USD"}, + ) + r.meta[b"this_param"] = b"should be passed over" return r def _make_response(self, url, status_code, request): - return self.frontier.response_model(url=url, status_code=status_code, request=request) + return self.frontier.response_model( + url=url, status_code=status_code, request=request + ) def _run_iteration(self): kwargs = self.downloader_simulator.downloader_info() if self.max_next_requests: - kwargs['max_next_requests'] = self.max_next_requests + kwargs["max_next_requests"] = self.max_next_requests requests = self.frontier.get_next_requests(**kwargs) @@ -66,23 +67,28 @@ def _run_iteration(self): for page_to_crawl in self.downloader_simulator.download(): crawled_page = self.graph_manager.get_page(url=page_to_crawl.url) if not crawled_page.has_errors: - response = self._make_response(url=page_to_crawl.url, - 
status_code=crawled_page.status, - request=page_to_crawl) + response = self._make_response( + url=page_to_crawl.url, + status_code=crawled_page.status, + request=page_to_crawl, + ) self.frontier.page_crawled(response=response) - self.frontier.links_extracted(request=response.request, - links=[self._make_request(link.url) for link in crawled_page.links]) + self.frontier.links_extracted( + request=response.request, + links=[self._make_request(link.url) for link in crawled_page.links], + ) else: - self.frontier.request_error(request=page_to_crawl, - error=crawled_page.status) - assert page_to_crawl.meta[b'this_param'] == b'should be passed over' - assert page_to_crawl.headers[b'X-Important-Header'] == b'Frontera' - assert page_to_crawl.method == b'POST' - assert page_to_crawl.cookies[b'currency'] == b'USD' + self.frontier.request_error( + request=page_to_crawl, error=crawled_page.status + ) + assert page_to_crawl.meta[b"this_param"] == b"should be passed over" + assert page_to_crawl.headers[b"X-Important-Header"] == b"Frontera" + assert page_to_crawl.method == b"POST" + assert page_to_crawl.cookies[b"currency"] == b"USD" return (requests, self.frontier.iteration, kwargs) -class BaseDownloaderSimulator(object): +class BaseDownloaderSimulator: def __init__(self): self.requests = None @@ -93,10 +99,7 @@ def download(self): return self.requests def downloader_info(self): - return { - 'key_type': 'domain', - 'overused_keys': [] - } + return {"key_type": "domain", "overused_keys": []} def idle(self): return True @@ -106,19 +109,19 @@ class DownloaderSimulator(BaseDownloaderSimulator): def __init__(self, rate): self._requests_per_slot = rate self.slots = OrderedDict() - super(DownloaderSimulator, self).__init__() + super().__init__() def update(self, requests): for request in requests: - hostname = urlparse(request.url).hostname or '' + hostname = urlparse(request.url).hostname or "" self.slots.setdefault(hostname, deque()).append(request) def download(self): output = [] _trash_can = [] - for key, requests in six.iteritems(self.slots): - for i in range(min(len(requests), self._requests_per_slot)): - output.append(requests.popleft()) + for key, requests in self.slots.items(): + for _i in range(min(len(requests), self._requests_per_slot)): + output.append(requests.popleft()) # noqa: PERF401 if not requests: _trash_can.append(key) @@ -127,13 +130,10 @@ def download(self): return output def downloader_info(self): - info = { - 'key_type': 'domain', - 'overused_keys': [] - } - for key, requests in six.iteritems(self.slots): + info = {"key_type": "domain", "overused_keys": []} + for key, requests in self.slots.items(): if len(requests) > self._requests_per_slot: - info['overused_keys'].append(key) + info["overused_keys"].append(key) return info def idle(self): diff --git a/frontera/utils/url.py b/frontera/utils/url.py index 87ebb2237..e747b3a07 100644 --- a/frontera/utils/url.py +++ b/frontera/utils/url.py @@ -1,14 +1,15 @@ -from __future__ import absolute_import -from six.moves.urllib import parse -from w3lib.util import to_native_str +from urllib import parse + +from w3lib.util import to_unicode def parse_url(url, encoding=None): """Return urlparsed url from the given argument (which could be an already parsed url) """ - return url if isinstance(url, parse.ParseResult) else \ - parse.urlparse(to_native_str(url)) + return ( + url if isinstance(url, parse.ParseResult) else parse.urlparse(to_unicode(url)) + ) def parse_domain_from_url(url): @@ -24,14 +25,15 @@ def parse_domain_from_url(url): 
------------------------------------------------------------------------------------------------------- """ import tldextract + extracted = tldextract.extract(url) scheme, _, _, _, _, _ = parse_url(url) sld = extracted.domain tld = extracted.suffix subdomain = extracted.subdomain - name = '.'.join([sld, tld]) if tld else sld - netloc = '.'.join([subdomain, name]) if subdomain else name + name = f"{sld}.{tld}" if tld else sld + netloc = f"{subdomain}.{name}" if subdomain else name return netloc, name, scheme, sld, tld, subdomain diff --git a/frontera/worker/__init__.py b/frontera/worker/__init__.py index 7c68785e9..e69de29bb 100644 --- a/frontera/worker/__init__.py +++ b/frontera/worker/__init__.py @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/frontera/worker/db.py b/frontera/worker/db.py index 3c9647adb..2b6c32db6 100644 --- a/frontera/worker/db.py +++ b/frontera/worker/db.py @@ -1,32 +1,37 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import logging -from traceback import format_stack -from signal import signal, SIGUSR1 -from logging.config import fileConfig from argparse import ArgumentParser +from logging.config import fileConfig +from pathlib import Path +from signal import SIGUSR1, signal from time import asctime -from os.path import exists +from traceback import format_stack + +from twisted.internet import task -from twisted.internet import reactor, task from frontera.core.components import DistributedBackend from frontera.core.manager import FrontierManager -from frontera.utils.url import parse_domain_from_url_fast from frontera.logger.handlers import CONSOLE - from frontera.settings import Settings +from frontera.utils.async_ import CallLaterOnce from frontera.utils.misc import load_object -from frontera.utils.async import CallLaterOnce +from frontera.utils.url import parse_domain_from_url_fast + from .server import WorkerJsonRpcService -import six -from six.moves import map logger = logging.getLogger("db-worker") -class Slot(object): - def __init__(self, new_batch, consume_incoming, consume_scoring, no_batches, no_scoring_log, - new_batch_delay, no_spider_log): +class Slot: + def __init__( + self, + new_batch, + consume_incoming, + consume_scoring, + no_batches, + no_scoring_log, + new_batch_delay, + no_spider_log, + ): self.new_batch = CallLaterOnce(new_batch) self.new_batch.setErrback(self.error) @@ -61,23 +66,25 @@ def schedule(self, on_start=False): self.scheduling.schedule(5.0) -class DBWorker(object): +class DBWorker: def __init__(self, settings, no_batches, no_incoming, no_scoring): - messagebus = load_object(settings.get('MESSAGE_BUS')) + messagebus = load_object(settings.get("MESSAGE_BUS")) self.mb = messagebus(settings) spider_log = self.mb.spider_log() self.spider_feed = self.mb.spider_feed() - self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b'db') + self.spider_log_consumer = spider_log.consumer(partition_id=None, type=b"db") self.spider_feed_producer = self.spider_feed.producer() self._manager = FrontierManager.from_settings(settings, db_worker=True) self._backend = self._manager.backend - codec_path = settings.get('MESSAGE_BUS_CODEC') - encoder_cls = load_object(codec_path+".Encoder") - decoder_cls = load_object(codec_path+".Decoder") + codec_path = settings.get("MESSAGE_BUS_CODEC") + encoder_cls = load_object(codec_path + ".Encoder") + decoder_cls = load_object(codec_path + ".Decoder") self._encoder = encoder_cls(self._manager.request_model) - self._decoder = 
decoder_cls(self._manager.request_model, self._manager.response_model) + self._decoder = decoder_cls( + self._manager.request_model, self._manager.response_model + ) if isinstance(self._backend, DistributedBackend) and not no_scoring: scoring_log = self.mb.scoring_log() @@ -86,17 +93,32 @@ def __init__(self, settings, no_batches, no_incoming, no_scoring): self.strategy_disabled = False else: self.strategy_disabled = True - self.spider_log_consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.scoring_log_consumer_batch_size = settings.get('SCORING_LOG_CONSUMER_BATCH_SIZE') - self.spider_feed_partitioning = 'fingerprint' if not settings.get('QUEUE_HOSTNAME_PARTITIONING') else 'hostname' + self.spider_log_consumer_batch_size = settings.get( + "SPIDER_LOG_CONSUMER_BATCH_SIZE" + ) + self.scoring_log_consumer_batch_size = settings.get( + "SCORING_LOG_CONSUMER_BATCH_SIZE" + ) + self.spider_feed_partitioning = ( + "fingerprint" + if not settings.get("QUEUE_HOSTNAME_PARTITIONING") + else "hostname" + ) self.max_next_requests = settings.MAX_NEXT_REQUESTS - self.slot = Slot(self.new_batch, self.consume_incoming, self.consume_scoring, no_batches, - self.strategy_disabled, settings.get('NEW_BATCH_DELAY'), no_incoming) + self.slot = Slot( + self.new_batch, + self.consume_incoming, + self.consume_scoring, + no_batches, + self.strategy_disabled, + settings.get("NEW_BATCH_DELAY"), + no_incoming, + ) self.job_id = 0 self.stats = { - 'consumed_since_start': 0, - 'consumed_scoring_since_start': 0, - 'pushed_since_start': 0 + "consumed_since_start": 0, + "consumed_scoring_since_start": 0, + "pushed_since_start": 0, } self._logging_task = task.LoopingCall(self.log_status) @@ -104,14 +126,16 @@ def set_process_info(self, process_info): self.process_info = process_info def run(self): + from twisted.internet import reactor + def debug(sig, frame): logger.critical("Signal received: printing stack trace") - logger.critical(str("").join(format_stack(frame))) + logger.critical("".join(format_stack(frame))) self.slot.schedule(on_start=True) self._logging_task.start(30) signal(SIGUSR1, debug) - reactor.addSystemEventTrigger('before', 'shutdown', self.stop) + reactor.addSystemEventTrigger("before", "shutdown", self.stop) reactor.run() def stop(self): @@ -119,7 +143,7 @@ def stop(self): self._manager.stop() def log_status(self): - for k, v in six.iteritems(self.stats): + for k, v in self.stats.items(): logger.info("%s=%s", k, v) def disable_new_batches(self): @@ -130,58 +154,68 @@ def enable_new_batches(self): def consume_incoming(self, *args, **kwargs): consumed = 0 - for m in self.spider_log_consumer.get_messages(timeout=1.0, count=self.spider_log_consumer_batch_size): + for m in self.spider_log_consumer.get_messages( + timeout=1.0, count=self.spider_log_consumer_batch_size + ): try: msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: + except (KeyError, TypeError) as e: # noqa: PERF203 logger.error("Decoding error: %s", e) continue else: type = msg[0] - if type == 'add_seeds': + if type == "add_seeds": _, seeds = msg - logger.info('Adding %i seeds', len(seeds)) + logger.info("Adding %i seeds", len(seeds)) for seed in seeds: - logger.debug('URL: %s', seed.url) + logger.debug("URL: %s", seed.url) self._backend.add_seeds(seeds) continue - if type == 'page_crawled': + if type == "page_crawled": _, response = msg logger.debug("Page crawled %s", response.url) - if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: + if ( + b"jid" not in response.meta + or 
response.meta[b"jid"] != self.job_id + ): continue self._backend.page_crawled(response) continue - if type == 'links_extracted': + if type == "links_extracted": _, request, links = msg logger.debug("Links extracted %s (%d)", request.url, len(links)) - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + if ( + b"jid" not in request.meta + or request.meta[b"jid"] != self.job_id + ): continue self._backend.links_extracted(request, links) continue - if type == 'request_error': + if type == "request_error": _, request, error = msg logger.debug("Request error %s", request.url) - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + if ( + b"jid" not in request.meta + or request.meta[b"jid"] != self.job_id + ): continue self._backend.request_error(request, error) continue - if type == 'offset': + if type == "offset": _, partition_id, offset = msg producer_offset = self.spider_feed_producer.get_offset(partition_id) if producer_offset is None: continue + lag = producer_offset - offset + if lag < 0: + # non-sense in general, happens when SW is restarted and not synced yet with Spiders. + continue + if lag < self.max_next_requests or offset == 0: + self.spider_feed.mark_ready(partition_id) else: - lag = producer_offset - offset - if lag < 0: - # non-sense in general, happens when SW is restarted and not synced yet with Spiders. - continue - if lag < self.max_next_requests or offset == 0: - self.spider_feed.mark_ready(partition_id) - else: - self.spider_feed.mark_busy(partition_id) + self.spider_feed.mark_busy(partition_id) continue - logger.debug('Unknown message type %s', type) + logger.debug("Unknown message type %s", type) finally: consumed += 1 """ @@ -190,9 +224,9 @@ def consume_incoming(self, *args, **kwargs): logger.info("Crawling is finished.") reactor.stop() """ - self.stats['consumed_since_start'] += consumed - self.stats['last_consumed'] = consumed - self.stats['last_consumption_run'] = asctime() + self.stats["consumed_since_start"] += consumed + self.stats["last_consumed"] = consumed + self.stats["last_consumption_run"] = asctime() self.slot.schedule() return consumed @@ -200,90 +234,117 @@ def consume_scoring(self, *args, **kwargs): consumed = 0 seen = set() batch = [] - for m in self.scoring_log_consumer.get_messages(count=self.scoring_log_consumer_batch_size): + for m in self.scoring_log_consumer.get_messages( + count=self.scoring_log_consumer_batch_size + ): try: msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: + except (KeyError, TypeError) as e: # noqa: PERF203 logger.error("Decoding error: %s", e) continue else: - if msg[0] == 'update_score': + if msg[0] == "update_score": _, request, score, schedule = msg - if request.meta[b'fingerprint'] not in seen: - batch.append((request.meta[b'fingerprint'], score, request, schedule)) - seen.add(request.meta[b'fingerprint']) - if msg[0] == 'new_job_id': + if request.meta[b"fingerprint"] not in seen: + batch.append( + (request.meta[b"fingerprint"], score, request, schedule) + ) + seen.add(request.meta[b"fingerprint"]) + if msg[0] == "new_job_id": self.job_id = msg[1] finally: consumed += 1 self.queue.schedule(batch) - self.stats['consumed_scoring_since_start'] += consumed - self.stats['last_consumed_scoring'] = consumed - self.stats['last_consumption_run_scoring'] = asctime() + self.stats["consumed_scoring_since_start"] += consumed + self.stats["last_consumed_scoring"] = consumed + self.stats["last_consumption_run_scoring"] = asctime() self.slot.schedule() def new_batch(self, *args, 
**kwargs): def get_hostname(request): try: - netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast(request.url) + netloc, name, scheme, sld, tld, subdomain = parse_domain_from_url_fast( + request.url + ) except Exception as e: - logger.error("URL parsing error %s, fingerprint %s, url %s" % (e, request.meta[b'fingerprint'], - request.url)) + logger.error( + "URL parsing error {}, fingerprint {}, url {}".format( + e, request.meta[b"fingerprint"], request.url + ) + ) return None else: - return name.encode('utf-8', 'ignore') + return name.encode("utf-8", "ignore") def get_fingerprint(request): - return request.meta[b'fingerprint'] + return request.meta[b"fingerprint"] partitions = self.spider_feed.available_partitions() - logger.info("Getting new batches for partitions %s" % str(",").join(map(str, partitions))) + logger.info( + f"Getting new batches for partitions {','.join(map(str, partitions))}" + ) if not partitions: return 0 count = 0 - if self.spider_feed_partitioning == 'hostname': + if self.spider_feed_partitioning == "hostname": get_key = get_hostname - elif self.spider_feed_partitioning == 'fingerprint': + elif self.spider_feed_partitioning == "fingerprint": get_key = get_fingerprint else: raise Exception("Unexpected value in self.spider_feed_partitioning") - for request in self._backend.get_next_requests(self.max_next_requests, partitions=partitions): + for request in self._backend.get_next_requests( + self.max_next_requests, partitions=partitions + ): try: - request.meta[b'jid'] = self.job_id + request.meta[b"jid"] = self.job_id eo = self._encoder.encode_request(request) except Exception as e: - logger.error("Encoding error, %s, fingerprint: %s, url: %s" % (e, - request.meta[b'fingerprint'], - request.url)) + logger.error( + "Encoding error, {}, fingerprint: {}, url: {}".format( + e, request.meta[b"fingerprint"], request.url + ) + ) continue finally: count += 1 self.spider_feed_producer.send(get_key(request), eo) - self.stats['pushed_since_start'] += count - self.stats['last_batch_size'] = count - self.stats.setdefault('batches_after_start', 0) - self.stats['batches_after_start'] += 1 - self.stats['last_batch_generated'] = asctime() + self.stats["pushed_since_start"] += count + self.stats["last_batch_size"] = count + self.stats.setdefault("batches_after_start", 0) + self.stats["batches_after_start"] += 1 + self.stats["last_batch_generated"] = asctime() return count -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser(description="Frontera DB worker.") - parser.add_argument('--no-batches', action='store_true', - help='Disables generation of new batches.') - parser.add_argument('--no-incoming', action='store_true', - help='Disables spider log processing.') - parser.add_argument('--no-scoring', action='store_true', - help='Disables scoring log processing.') - parser.add_argument('--config', type=str, required=True, - help='Settings module name, should be accessible by import.') - parser.add_argument('--log-level', '-L', type=str, default='INFO', - help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.") - parser.add_argument('--port', type=int, help="Json Rpc service port to listen.") + parser.add_argument( + "--no-batches", action="store_true", help="Disables generation of new batches." + ) + parser.add_argument( + "--no-incoming", action="store_true", help="Disables spider log processing." + ) + parser.add_argument( + "--no-scoring", action="store_true", help="Disables scoring log processing." 
+ ) + parser.add_argument( + "--config", + type=str, + required=True, + help="Settings module name, should be accessible by import.", + ) + parser.add_argument( + "--log-level", + "-L", + type=str, + default="INFO", + help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL.", + ) + parser.add_argument("--port", type=int, help="Json Rpc service port to listen.") args = parser.parse_args() settings = Settings(module=args.config) @@ -291,7 +352,7 @@ def get_fingerprint(request): settings.set("JSONRPC_PORT", [args.port]) logging_config_path = settings.get("LOGGING_CONFIG") - if logging_config_path and exists(logging_config_path): + if logging_config_path and Path(logging_config_path).exists(): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) @@ -302,4 +363,3 @@ def get_fingerprint(request): server = WorkerJsonRpcService(worker, settings) server.start_listening() worker.run() - diff --git a/frontera/worker/server.py b/frontera/worker/server.py index a77a49bae..51010734f 100644 --- a/frontera/worker/server.py +++ b/frontera/worker/server.py @@ -1,13 +1,11 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from logging import getLogger from json import JSONDecoder, JSONEncoder +from logging import getLogger from sys import exc_info from traceback import format_exception -from twisted.web import server, resource +from twisted.web import resource, server -from frontera.utils.async import listen_tcp +from frontera.utils.async_ import listen_tcp logger = getLogger("cf-server") @@ -15,27 +13,26 @@ def jsonrpc_error(id, code, message, data=None): """Create JSON-RPC error response""" return { - 'jsonrpc': '2.0', - 'error': { - 'code': code, - 'message': message, - 'data': data, + "jsonrpc": "2.0", + "error": { + "code": code, + "message": message, + "data": data, }, - 'id': id, + "id": id, } def jsonrpc_result(id, result): """Create JSON-RPC result response""" return { - 'jsonrpc': '2.0', - 'result': result, - 'id': id, + "jsonrpc": "2.0", + "result": result, + "id": id, } class JsonRpcError(Exception): - def __init__(self, code, message): self.code = code self.message = message @@ -45,7 +42,6 @@ def __call__(self, id): class JsonResource(resource.Resource): - json_encoder = JSONEncoder() json_decoder = JSONDecoder() @@ -55,15 +51,17 @@ def render(self, txrequest): def render_object(self, obj, txrequest): r = self.json_encoder.encode(obj) + "\n" - txrequest.setHeader('Content-Type', 'application/json') - txrequest.setHeader('Access-Control-Allow-Origin', '*') - txrequest.setHeader('Access-Control-Allow-Methods', 'GET, POST, PATCH, PUT, DELETE') - txrequest.setHeader('Access-Control-Allow-Headers', 'X-Requested-With') - txrequest.setHeader('Content-Length', len(r)) + txrequest.setHeader("Content-Type", "application/json") + txrequest.setHeader("Access-Control-Allow-Origin", "*") + txrequest.setHeader( + "Access-Control-Allow-Methods", "GET, POST, PATCH, PUT, DELETE" + ) + txrequest.setHeader("Access-Control-Allow-Headers", "X-Requested-With") + txrequest.setHeader("Content-Length", len(r)) return r def parse_jsonrpc(self, txrequest): - if hasattr(txrequest.content, 'read'): + if hasattr(txrequest.content, "read"): data = txrequest.content.read() else: data = txrequest.content.getvalue() @@ -71,8 +69,7 @@ def parse_jsonrpc(self, txrequest): class StatusResource(JsonResource): - - ws_name = 'status' + ws_name = "status" def __init__(self, worker): self.worker = worker @@ -80,22 +77,21 @@ def __init__(self, worker): def render_GET(self, txrequest): return { 
- 'is_finishing': self.worker.slot.is_finishing, - 'disable_new_batches': self.worker.slot.no_batches, - 'stats': self.worker.stats + "is_finishing": self.worker.slot.is_finishing, + "disable_new_batches": self.worker.slot.no_batches, + "stats": self.worker.stats, } class JsonRpcResource(JsonResource): - - ws_name = 'jsonrpc' + ws_name = "jsonrpc" def __init__(self): JsonResource.__init__(self) def render_POST(self, txrequest): jrequest = self.parse_jsonrpc(txrequest) - method = jrequest['method'] + method = jrequest["method"] try: try: return self.process_request(method, jrequest) @@ -103,52 +99,52 @@ def render_POST(self, txrequest): if isinstance(err, JsonRpcError): raise err trace_lines = format_exception(*exc_info()) - raise JsonRpcError(500, "Error processing request: %s" % (str("").join(trace_lines))) + raise JsonRpcError( + 500, f"Error processing request: {''.join(trace_lines)}" + ) from err except JsonRpcError as err: - return err(jrequest['id']) + return err(jrequest["id"]) class WorkerJsonRpcResource(JsonRpcResource): - def __init__(self, worker): self.worker = worker JsonRpcResource.__init__(self) def process_request(self, method, jrequest): - if method == 'disable_new_batches': + if method == "disable_new_batches": self.worker.disable_new_batches() - return jsonrpc_result(jrequest['id'], "success") + return jsonrpc_result(jrequest["id"], "success") - if method == 'enable_new_batches': + if method == "enable_new_batches": self.worker.enable_new_batches() - return jsonrpc_result(jrequest['id'], "success") + return jsonrpc_result(jrequest["id"], "success") raise JsonRpcError(400, "Unknown method") class RootResource(JsonResource): - def render_GET(self, txrequest): - return {'resources': list(self.children.keys())} + return {"resources": list(self.children.keys())} def getChild(self, name, txrequest): - if name == '': + if name == "": return self return JsonResource.getChild(self, name, txrequest) class JsonRpcService(server.Site): def __init__(self, root, settings): - logfile = settings.get('JSONRPC_LOGFILE') - self.portrange = settings.get('JSONRPC_PORT', [6023, 6073]) - self.host = settings.get('JSONRPC_HOST', '127.0.0.1') + logfile = settings.get("JSONRPC_LOGFILE") + self.portrange = settings.get("JSONRPC_PORT", [6023, 6073]) + self.host = settings.get("JSONRPC_HOST", "127.0.0.1") server.Site.__init__(self, root, logPath=logfile) self.noisy = False def start_listening(self): self.port = listen_tcp(self.portrange, self.host, self) - h = self.port.getHost() - logger.info('Web service listening on %(host)s:%(port)d'.format(host=h.host, port=h.port)) + address = self.port.getHost() + logger.info(f"Web service listening on {address.host}:{address.port}") def stop_listening(self): self.port.stopListening() @@ -157,12 +153,12 @@ def stop_listening(self): class WorkerJsonRpcService(JsonRpcService): def __init__(self, worker, settings): root = RootResource() - root.putChild('status', StatusResource(worker)) - root.putChild('jsonrpc', WorkerJsonRpcResource(worker)) + root.putChild("status", StatusResource(worker)) + root.putChild("jsonrpc", WorkerJsonRpcResource(worker)) JsonRpcService.__init__(self, root, settings) self.worker = worker def start_listening(self): JsonRpcService.start_listening(self) address = self.port.getHost() - self.worker.set_process_info("%s:%d" % (address.host, address.port)) + self.worker.set_process_info(f"{address.host}:{address.port}") diff --git a/frontera/worker/strategies/__init__.py b/frontera/worker/strategies/__init__.py index 1b46f5d96..1e7411ee1 
100644 --- a/frontera/worker/strategies/__init__.py +++ b/frontera/worker/strategies/__init__.py @@ -1,14 +1,10 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.core.models import Request -from frontera.contrib.middlewares.fingerprint import UrlFingerprintMiddleware - from abc import ABCMeta, abstractmethod -import six + +from frontera.contrib.middlewares.fingerprint import UrlFingerprintMiddleware +from frontera.core.models import Request -@six.add_metaclass(ABCMeta) -class BaseCrawlingStrategy(object): +class BaseCrawlingStrategy(metaclass=ABCMeta): """ Interface definition for a crawling strategy. @@ -96,7 +92,9 @@ def schedule(self, request, score=1.0, dont_queue=False): """ self._mb_stream.send(request, score, dont_queue) - def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=None, body=b''): + def create_request( + self, url, method=b"GET", headers=None, cookies=None, meta=None, body=b"" + ): """ Creates request with specified fields, with state fetched from backend. This method only creates request, but isn't getting it's state from storage. Use self.refresh_states on a batch of requests to get their states @@ -110,7 +108,9 @@ def create_request(self, url, method=b'GET', headers=None, cookies=None, meta=No :param body: str :return: :class:`Request ` """ - r = Request(url, method=method, headers=headers, cookies=cookies, meta=meta, body=body) + r = Request( + url, method=method, headers=headers, cookies=cookies, meta=meta, body=body + ) self.url_mw._add_fingerprint(r) return r diff --git a/frontera/worker/strategies/bfs.py b/frontera/worker/strategies/bfs.py index 838498d7f..13f69eb2a 100644 --- a/frontera/worker/strategies/bfs.py +++ b/frontera/worker/strategies/bfs.py @@ -1,32 +1,30 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from six.moves.urllib.parse import urlparse +from urllib.parse import urlparse + from frontera.core.components import States from frontera.worker.strategies import BaseCrawlingStrategy class CrawlingStrategy(BaseCrawlingStrategy): - def add_seeds(self, seeds): for seed in seeds: - if seed.meta[b'state'] is States.NOT_CRAWLED: - seed.meta[b'state'] = States.QUEUED + if seed.meta[b"state"] is States.NOT_CRAWLED: + seed.meta[b"state"] = States.QUEUED self.schedule(seed) def page_crawled(self, response): - response.meta[b'state'] = States.CRAWLED + response.meta[b"state"] = States.CRAWLED def links_extracted(self, request, links): for link in links: - if link.meta[b'state'] is States.NOT_CRAWLED: - link.meta[b'state'] = States.QUEUED + if link.meta[b"state"] is States.NOT_CRAWLED: + link.meta[b"state"] = States.QUEUED self.schedule(link, self.get_score(link.url)) def page_error(self, request, error): - request.meta[b'state'] = States.ERROR + request.meta[b"state"] = States.ERROR self.schedule(request, score=0.0, dont_queue=True) def get_score(self, url): url_parts = urlparse(url) - path_parts = url_parts.path.split('/') + path_parts = url_parts.path.split("/") return 1.0 / (max(len(path_parts), 1.0) + len(url_parts.path) * 0.1) diff --git a/frontera/worker/strategy.py b/frontera/worker/strategy.py index a33008ca3..5ac2bc503 100644 --- a/frontera/worker/strategy.py +++ b/frontera/worker/strategy.py @@ -1,29 +1,24 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from time import asctime import logging -from traceback import format_stack, format_tb -from signal import signal, SIGUSR1 -from logging.config import fileConfig from argparse import ArgumentParser -from 
os.path import exists -from frontera.utils.misc import load_object +from binascii import hexlify +from collections.abc import Iterable +from logging.config import fileConfig +from pathlib import Path +from signal import SIGUSR1, signal +from time import asctime +from traceback import format_stack, format_tb -from frontera.core.manager import FrontierManager -from frontera.logger.handlers import CONSOLE from twisted.internet.task import LoopingCall -from twisted.internet import reactor +from frontera.core.manager import FrontierManager +from frontera.logger.handlers import CONSOLE from frontera.settings import Settings -from collections import Iterable -from binascii import hexlify -import six - +from frontera.utils.misc import load_object logger = logging.getLogger("strategy-worker") -class UpdateScoreStream(object): +class UpdateScoreStream: def __init__(self, encoder, scoring_log_producer, size): self._encoder = encoder self._buffer = [] @@ -31,11 +26,7 @@ def __init__(self, encoder, scoring_log_producer, size): self._size = size def send(self, request, score=1.0, dont_queue=False): - encoded = self._encoder.encode_update_score( - request, - score, - not dont_queue - ) + encoded = self._encoder.encode_update_score(request, score, not dont_queue) self._buffer.append(encoded) if len(self._buffer) > self._size: self.flush() @@ -46,8 +37,7 @@ def flush(self): self._buffer = [] -class StatesContext(object): - +class StatesContext: def __init__(self, states): self._requests = [] self._states = states @@ -55,9 +45,9 @@ def __init__(self, states): def to_fetch(self, requests): if isinstance(requests, Iterable): - self._fingerprints.update(x.meta[b'fingerprint'] for x in requests) + self._fingerprints.update(x.meta[b"fingerprint"] for x in requests) return - self._fingerprints.add(requests.meta[b'fingerprint']) + self._fingerprints.add(requests.meta[b"fingerprint"]) def fetch(self): self._states.fetch(self._fingerprints) @@ -79,43 +69,49 @@ def flush(self): logger.info("Flushing of states finished") -class StrategyWorker(object): +class StrategyWorker: def __init__(self, settings, strategy_class): - partition_id = settings.get('SCORING_PARTITION_ID') - if partition_id is None or type(partition_id) != int: + partition_id = settings.get("SCORING_PARTITION_ID") + if partition_id is None or not isinstance(partition_id, int): raise AttributeError("Scoring worker partition id isn't set.") - messagebus = load_object(settings.get('MESSAGE_BUS')) + messagebus = load_object(settings.get("MESSAGE_BUS")) mb = messagebus(settings) spider_log = mb.spider_log() scoring_log = mb.scoring_log() - self.consumer = spider_log.consumer(partition_id=partition_id, type=b'sw') + self.consumer = spider_log.consumer(partition_id=partition_id, type=b"sw") self.scoring_log_producer = scoring_log.producer() self._manager = FrontierManager.from_settings(settings, strategy_worker=True) - codec_path = settings.get('MESSAGE_BUS_CODEC') - encoder_cls = load_object(codec_path+".Encoder") - decoder_cls = load_object(codec_path+".Decoder") - self._decoder = decoder_cls(self._manager.request_model, self._manager.response_model) + codec_path = settings.get("MESSAGE_BUS_CODEC") + encoder_cls = load_object(codec_path + ".Encoder") + decoder_cls = load_object(codec_path + ".Decoder") + self._decoder = decoder_cls( + self._manager.request_model, self._manager.response_model + ) self._encoder = encoder_cls(self._manager.request_model) - self.update_score = UpdateScoreStream(self._encoder, self.scoring_log_producer, 1024) + self.update_score 
= UpdateScoreStream( + self._encoder, self.scoring_log_producer, 1024 + ) self.states_context = StatesContext(self._manager.backend.states) - self.consumer_batch_size = settings.get('SPIDER_LOG_CONSUMER_BATCH_SIZE') - self.strategy = strategy_class.from_worker(self._manager, self.update_score, self.states_context) + self.consumer_batch_size = settings.get("SPIDER_LOG_CONSUMER_BATCH_SIZE") + self.strategy = strategy_class.from_worker( + self._manager, self.update_score, self.states_context + ) self.states = self._manager.backend.states - self.stats = { - 'consumed_since_start': 0 - } + self.stats = {"consumed_since_start": 0} self.job_id = 0 self.task = LoopingCall(self.work) self._logging_task = LoopingCall(self.log_status) self._flush_states_task = LoopingCall(self.flush_states) - logger.info("Strategy worker is initialized and consuming partition %d", partition_id) + logger.info( + "Strategy worker is initialized and consuming partition %d", partition_id + ) def collect_unknown_message(self, msg): - logger.debug('Unknown message %s', msg) + logger.debug("Unknown message %s", msg) def on_unknown_message(self, msg): pass @@ -123,10 +119,12 @@ def on_unknown_message(self, msg): def collect_batch(self): consumed = 0 batch = [] - for m in self.consumer.get_messages(count=self.consumer_batch_size, timeout=1.0): + for m in self.consumer.get_messages( + count=self.consumer_batch_size, timeout=1.0 + ): try: msg = self._decoder.decode(m) - except (KeyError, TypeError) as e: + except (KeyError, TypeError) as e: # noqa: PERF203 logger.error("Decoding error:") logger.exception(e) logger.debug("Message %s", hexlify(m)) @@ -135,29 +133,28 @@ def collect_batch(self): type = msg[0] batch.append(msg) try: - if type == 'add_seeds': + if type == "add_seeds": _, seeds = msg self.states_context.to_fetch(seeds) continue - if type == 'page_crawled': + if type == "page_crawled": _, response = msg self.states_context.to_fetch(response) continue - if type == 'links_extracted': + if type == "links_extracted": _, request, links = msg self.states_context.to_fetch(request) self.states_context.to_fetch(links) continue - if type == 'request_error': + if type == "request_error": _, request, error = msg self.states_context.to_fetch(request) continue - if type == 'offset': + if type == "offset": continue self.collect_unknown_message(msg) except Exception as exc: logger.exception(exc) - pass finally: consumed += 1 return (batch, consumed) @@ -166,36 +163,46 @@ def process_batch(self, batch): for msg in batch: type = msg[0] try: - if type == 'add_seeds': + if type == "add_seeds": _, seeds = msg for seed in seeds: - seed.meta[b'jid'] = self.job_id + seed.meta[b"jid"] = self.job_id self.on_add_seeds(seeds) continue - if type == 'page_crawled': + if type == "page_crawled": _, response = msg - if b'jid' not in response.meta or response.meta[b'jid'] != self.job_id: + if ( + b"jid" not in response.meta + or response.meta[b"jid"] != self.job_id + ): continue self.on_page_crawled(response) continue - if type == 'links_extracted': + if type == "links_extracted": _, request, links = msg - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + if ( + b"jid" not in request.meta + or request.meta[b"jid"] != self.job_id + ): continue self.on_links_extracted(request, links) continue - if type == 'request_error': + if type == "request_error": _, request, error = msg - if b'jid' not in request.meta or request.meta[b'jid'] != self.job_id: + if ( + b"jid" not in request.meta + or request.meta[b"jid"] != self.job_id + ): 
continue self.on_request_error(request, error) continue self.on_unknown_message(msg) except Exception as exc: logger.exception(exc) - pass def work(self): + from twisted.internet import reactor + batch, consumed = self.collect_batch() self.states_context.fetch() self.process_batch(batch) @@ -210,15 +217,17 @@ def work(self): logger.info("Finishing.") reactor.callFromThread(reactor.stop) - self.stats['last_consumed'] = consumed - self.stats['last_consumption_run'] = asctime() - self.stats['consumed_since_start'] += consumed + self.stats["last_consumed"] = consumed + self.stats["last_consumption_run"] = asctime() + self.stats["consumed_since_start"] += consumed def run(self): + from twisted.internet import reactor + def log_failure(failure): logger.exception(failure.value) if failure.frames: - logger.critical(str("").join(format_tb(failure.getTracebackObject()))) + logger.critical("".join(format_tb(failure.getTracebackObject()))) def errback_main(failure): log_failure(failure) @@ -230,17 +239,17 @@ def errback_flush_states(failure): def debug(sig, frame): logger.critical("Signal received: printing stack trace") - logger.critical(str("").join(format_stack(frame))) + logger.critical("".join(format_stack(frame))) self.task.start(interval=0).addErrback(errback_main) self._logging_task.start(interval=30) self._flush_states_task.start(interval=300).addErrback(errback_flush_states) signal(SIGUSR1, debug) - reactor.addSystemEventTrigger('before', 'shutdown', self.stop) + reactor.addSystemEventTrigger("before", "shutdown", self.stop) reactor.run() def log_status(self): - for k, v in six.iteritems(self.stats): + for k, v in self.stats.items(): logger.info("%s=%s", k, v) def flush_states(self): @@ -253,7 +262,7 @@ def stop(self): self._manager.stop() def on_add_seeds(self, seeds): - logger.debug('Adding %i seeds', len(seeds)) + logger.debug("Adding %i seeds", len(seeds)) for seed in seeds: logger.debug("URL: %s", seed.url) self.states.set_states(seeds) @@ -283,30 +292,46 @@ def on_request_error(self, request, error): def setup_environment(): parser = ArgumentParser(description="Frontera strategy worker.") - parser.add_argument('--config', type=str, required=True, - help='Settings module name, should be accessible by import') - parser.add_argument('--log-level', '-L', type=str, default='INFO', - help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL") - parser.add_argument('--strategy', type=str, - help='Crawling strategy class path') - parser.add_argument('--partition-id', type=int, - help="Instance partition id.") + parser.add_argument( + "--config", + type=str, + required=True, + help="Settings module name, should be accessible by import", + ) + parser.add_argument( + "--log-level", + "-L", + type=str, + default="INFO", + help="Log level, for ex. DEBUG, INFO, WARN, ERROR, FATAL", + ) + parser.add_argument("--strategy", type=str, help="Crawling strategy class path") + parser.add_argument("--partition-id", type=int, help="Instance partition id.") args = parser.parse_args() settings = Settings(module=args.config) - strategy_classpath = args.strategy if args.strategy else settings.get('CRAWLING_STRATEGY') + strategy_classpath = ( + args.strategy if args.strategy else settings.get("CRAWLING_STRATEGY") + ) if not strategy_classpath: - raise ValueError("Couldn't locate strategy class path. Please supply it either using command line option or " - "settings file.") + raise ValueError( + "Couldn't locate strategy class path. Please supply it either using command line option or " + "settings file." 
+ ) strategy_class = load_object(strategy_classpath) - partition_id = args.partition_id if args.partition_id is not None else settings.get('SCORING_PARTITION_ID') - if partition_id >= settings.get('SPIDER_LOG_PARTITIONS') or partition_id < 0: - raise ValueError("Partition id (%d) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." % - partition_id) - settings.set('SCORING_PARTITION_ID', partition_id) + partition_id = ( + args.partition_id + if args.partition_id is not None + else settings.get("SCORING_PARTITION_ID") + ) + if partition_id >= settings.get("SPIDER_LOG_PARTITIONS") or partition_id < 0: + raise ValueError( + f"Partition id ({partition_id}) cannot be less than zero or more than SPIDER_LOG_PARTITIONS." + ) + settings.set("SCORING_PARTITION_ID", partition_id) logging_config_path = settings.get("LOGGING_CONFIG") - if logging_config_path and exists(logging_config_path): + if logging_config_path and Path(logging_config_path).exists(): fileConfig(logging_config_path) else: logging.basicConfig(level=args.log_level) @@ -315,7 +340,7 @@ def setup_environment(): return settings, strategy_class -if __name__ == '__main__': +if __name__ == "__main__": settings, strategy_class = setup_environment() worker = StrategyWorker(settings, strategy_class) worker.run() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..7df061b33 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,157 @@ +[tool.bumpversion] +current_version = "0.7.1" +commit = true +tag = true + +[[tool.bumpversion.files]] +filename = 'CHANGES.rst' +search = "\\(unreleased\\)$" +replace = "({now:%Y-%m-%d})" +regex = true + +[[tool.bumpversion.files]] +filename = "docs/source/conf.py" +search = "version = [\"']\\d+\\.\\d+[\"']" +serialize = ["{major}.{minor}"] +replace = "version = \"{current_version}\"" +regex = true + +[[tool.bumpversion.files]] +filename = "docs/source/conf.py" + +[[tool.bumpversion.files]] +filename = "setup.py" +parse = "version\\s*=\\s*\"(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)" + +[tool.ruff.lint] +extend-select = [ + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # flake8-tidy-imports + "TID", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Missing docstring in public module + "D100", + # Missing docstring in public class + "D101", + # Missing docstring in public method + "D102", + # Missing docstring in public function + "D103", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in public nested class + "D106", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should end with a period + "D400",
+ # First line should be in imperative mood; try rephrasing + "D401", + # First line should not be the function's "signature" + "D402", + # First word of the first line should be properly capitalized + "D403", + # Too many return statements + "PLR0911", + # Too many branches + "PLR0912", + # Too many arguments in function definition + "PLR0913", + # Too many statements + "PLR0915", + # Magic value used in comparison + "PLR2004", + # String contains ambiguous {}. + "RUF001", + # Docstring contains ambiguous {}. + "RUF002", + # Comment contains ambiguous {}. + "RUF003", + # Mutable class attributes should be annotated with `typing.ClassVar` + "RUF012", + # Use of `assert` detected + "S101", + # Using lxml to parse untrusted data is known to be vulnerable to XML attacks + "S320", +] + +[tool.ruff.lint.flake8-tidy-imports] +banned-module-level-imports = [ + "twisted.internet.reactor", +] + +[tool.ruff.lint.per-file-ignores] +# F403 (import *) is not important for examples, same as PERF (performance) and +# S (security). +"examples/*" = ["F403", "PERF", "S"] +# E402 (Module level import not at top of file) is skipped because of the use +# of pytest.importorskip. B904 (raise … from) and S (security) are not +# important for tests. +"tests/*" = ["B904", "E402", "S"] + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7c718e7af..000000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -six>=1.8.0 -w3lib>=1.15.0 diff --git a/requirements/tests.txt b/requirements/tests.txt deleted file mode 100644 index 455cd0c35..000000000 --- a/requirements/tests.txt +++ /dev/null @@ -1,15 +0,0 @@ -pytest>=2.6.4 -PyMySQL>=0.6.3 -psycopg2>=2.5.4 -scrapy>=0.24 --r tldextract.txt -SQLAlchemy>=1.0.0 -cachetools -pyzmq -msgpack-python>=0.4 -kafka-python>=1.0.0 -pytest-cov -happybase>=1.0.0 -mock -boto>=2.42.0 --r logging.txt diff --git a/requirements/tldextract.txt b/requirements/tldextract.txt deleted file mode 100644 index c616d8715..000000000 --- a/requirements/tldextract.txt +++ /dev/null @@ -1 +0,0 @@ -tldextract>=1.5.1 diff --git a/setup.py b/setup.py index 5f305a258..87856c7be 100644 --- a/setup.py +++ b/setup.py @@ -1,89 +1,59 @@ -from setuptools import setup, find_packages - -import versioneer -versioneer.VCS = 'git' -versioneer.versionfile_source = 'frontera/_version.py' -versioneer.versionfile_build = 'frontera/_version.py' -versioneer.tag_prefix = 'v' # tags are like v1.2.0 -versioneer.parentdir_prefix = 'frontera-' - +from setuptools import find_packages, setup setup( - name='frontera', - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), - packages=find_packages(exclude=('tests', 'tests.*', 'examples', 'examples.*')), - url='https://github.com/scrapinghub/frontera', - description='A scalable frontier for web crawlers', - author='Frontera developers', - maintainer='Alexander Sibiryakov', - maintainer_email='sibiryakov@scrapinghub.com', - license='BSD', + name="frontera", + version="0.7.1", + packages=find_packages(exclude=("tests", "tests.*", "examples", "examples.*")), + url="https://github.com/scrapinghub/frontera", + description="A scalable frontier for web crawlers", + author="Frontera developers", + maintainer="Alexander Sibiryakov", + maintainer_email="sibiryakov@scrapinghub.com", + license="BSD", include_package_data=True, zip_safe=False, - keywords=['crawler', 'frontier', 'scrapy', 'web', 'requests', 'frontera'], + keywords=["crawler", "frontier", "scrapy", "web", "requests", "frontera"], 
classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Topic :: Internet :: WWW/HTTP', - 'Topic :: Software Development :: Libraries :: Application Frameworks', - 'Topic :: Software Development :: Libraries :: Python Modules', - ], - install_requires=[ - 'six>=1.8.0', - 'w3lib>=1.15.0' + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Application Frameworks", + "Topic :: Software Development :: Libraries :: Python Modules", ], + install_requires=["w3lib>=1.17.0"], + python_requires=">=3.9", extras_require={ - 'sql': [ - 'SQLAlchemy>=1.0.0', - 'cachetools' - ], - 'graphs': [ - 'pyparsing==1.5.7', - 'pydot==1.0.28', - 'SQLAlchemy' - ], - 'logging': [ - 'colorlog>=2.4.0', - 'python-json-logger>=0.1.5' - ], - 'tldextract': [ - 'tldextract>=1.5.1', + "s3": [ + "boto>=2.49.0", ], - 'hbase': [ - 'happybase>=1.0.0' + "scrapy": [ + "scrapy>=2.7.0", ], - 'zeromq': [ - 'pyzmq', - 'msgpack-python>=0.4' + "sql": ["cachetools>=0.4.0", "SQLAlchemy>=1.0.0,<1.4"], + "graphs": ["pyparsing==1.5.7", "pydot==1.0.28", "SQLAlchemy"], + "logging": ["colorlog>=2.4.0", "python-json-logger>=0.1.5"], + "tldextract": [ + "tldextract>=1.5.1", ], - 'kafka': [ - 'kafka-python>=1.0.0' + "hbase": [ + "cachetools>=0.4.0", + "happybase>=1.2.0", + "msgpack-python>=0.4", + # https://github.com/python-happybase/happybase/pull/261 + "setuptools>=50.3.1", ], - 'distributed': [ - 'Twisted' - ] + "zeromq": ["pyzmq>=19.0.2", "msgpack-python>=0.4"], + "kafka": ["kafka-python>=1.4.3,<2.1", "twisted>=20.3.0"], + "distributed": ["Twisted"], }, - tests_require=[ - "pytest>=2.6.4", - "PyMySQL>=0.6.3", - "psycopg2>=2.5.4", - "scrapy>=0.24", - "tldextract>=1.5.1", - "SQLAlchemy>=1.0.0", - "cachetools", - "mock", - "boto>=2.42.0", - "colorlog>=2.4.0", - "python-json-logger>=0.1.5" - ] ) diff --git a/tests/__init__.py b/tests/__init__.py index 9b4dc7f3d..e69de29bb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +0,0 @@ -try: - import unittest.mock as mock -except ImportError: - import mock diff --git a/tests/backends.py b/tests/backends.py index f3cdab956..c4162bcbb 100644 --- a/tests/backends.py +++ b/tests/backends.py @@ -1,12 +1,11 @@ -from __future__ import absolute_import import pytest -from frontera import FrontierManager, Settings, FrontierTester +from frontera import FrontierManager, FrontierTester, Settings from frontera.utils import graphs from frontera.utils.tester import BaseDownloaderSimulator -class BackendTest(object): +class BackendTest: """ A simple pytest base class with helper methods for :class:`Backend ` testing. 
@@ -16,7 +15,7 @@ class BackendTest(object): def setup_method(self, method): if not self.backend_class: - pytest.fail('missing backend_class!') + pytest.fail("missing backend_class!") self.setup_backend(method) def teardown_method(self, method): @@ -26,13 +25,11 @@ def setup_backend(self, method): """ Setup method called before each test method call """ - pass def teardown_backend(self, method): """ Teardown method called after each test method call """ - pass def get_frontier(self): """ @@ -44,9 +41,7 @@ def get_settings(self): """ Returns backend settings """ - return Settings(attributes={ - 'BACKEND': self.backend_class - }) + return Settings(attributes={"BACKEND": self.backend_class}) TEST_SITES = { @@ -62,46 +57,64 @@ class BackendSequenceTest(BackendTest): A pytest base class for testing :class:`Backend ` crawling sequences. """ + def get_settings(self): - settings = super(BackendSequenceTest, self).get_settings() + settings = super().get_settings() settings.TEST_MODE = True settings.LOGGING_MANAGER_ENABLED = False settings.LOGGING_BACKEND_ENABLED = False settings.LOGGING_DEBUGGING_ENABLED = False return settings - def get_sequence(self, site_list, max_next_requests, downloader_simulator=BaseDownloaderSimulator(), - frontier_tester=FrontierTester): + def get_sequence( + self, + site_list, + max_next_requests, + downloader_simulator=None, + frontier_tester=FrontierTester, + ): """ Returns an Frontera iteration sequence from a site list :param list site_list: A list of sites to use as frontier seeds. :param int max_next_requests: Max next requests for the frontier. """ + if downloader_simulator is None: + downloader_simulator = BaseDownloaderSimulator() # Graph graph_manager = graphs.Manager() graph_manager.add_site_list(site_list) # Tester - tester = frontier_tester(frontier=self.get_frontier(), - graph_manager=graph_manager, - max_next_requests=max_next_requests, - downloader_simulator=downloader_simulator) + tester = frontier_tester( + frontier=self.get_frontier(), + graph_manager=graph_manager, + max_next_requests=max_next_requests, + downloader_simulator=downloader_simulator, + ) tester.run() return tester.sequence - def get_url_sequence(self, site_list, max_next_requests, downloader_simulator=BaseDownloaderSimulator(), - frontier_tester=FrontierTester): + def get_url_sequence( + self, + site_list, + max_next_requests, + downloader_simulator=None, + frontier_tester=FrontierTester, + ): """ Returns a crawling sequence from a site list :param list site_list: A list of sites to use as frontier seeds. :param int max_next_requests: Max next requests for the frontier. 
""" + if downloader_simulator is None: + downloader_simulator = BaseDownloaderSimulator() sequence = [] - for requests, iteration, dl_info in self.get_sequence(site_list, max_next_requests, downloader_simulator, - frontier_tester): + for requests, _iteration, _dl_info in self.get_sequence( + site_list, max_next_requests, downloader_simulator, frontier_tester + ): sequence.extend([r.url for r in requests]) return sequence @@ -115,7 +128,7 @@ def assert_sequence(self, site_list, expected_sequence, max_next_requests): # Get sequence sequence = self.get_url_sequence(site_list, max_next_requests) - #print [str(n) for n in sequence] + # print [str(n) for n in sequence] # Assert sequence equals expected assert len(sequence) == len(expected_sequence) @@ -123,52 +136,110 @@ def assert_sequence(self, site_list, expected_sequence, max_next_requests): class FIFOBackendTest(BackendSequenceTest): - EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222' + "A1", + "A11", + "A12", + "A111", + "A112", + "A121", + "A122", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", ], "SEQUENCE_02_A": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + "A1", + "B1", + "A11", + "A12", + "B11", + "B12", + "A111", + "A112", + "A121", + "A122", + "B111", + "B112", + "B121", + "B122", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", + "B1111", + "B1112", + "B1121", + "B1122", + "B1211", + "B1212", + "B1221", + "B1222", ], "SEQUENCE_03_A": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + "C1", + "C11", + "C12", + "C111", + "C112", + "C121", + "C122", + "C1111", + "C1112", + "C1121", + "C1122", + "C1211", + "C1212", + "C1221", + "C1222", + "C11111", + "C11112", + "C11121", + "C11122", + "C11211", + "C11212", + "C11221", + "C11222", + "C12111", + "C12112", + "C12121", + "C12122", + "C12211", + "C12212", + "C12221", + "C12222", ], } @pytest.mark.parametrize( - ('site_list', 'max_next_requests', 'expected_sequence'), [ - - ('SITE_01', 1, 'SEQUENCE_01_A'), - ('SITE_01', 2, 'SEQUENCE_01_A'), - ('SITE_01', 5, 'SEQUENCE_01_A'), - ('SITE_01', 10, 'SEQUENCE_01_A'), - ('SITE_01', 100, 'SEQUENCE_01_A'), - - ('SITE_02', 1, 'SEQUENCE_02_A'), - ('SITE_02', 2, 'SEQUENCE_02_A'), - ('SITE_02', 5, 'SEQUENCE_02_A'), - ('SITE_02', 10, 'SEQUENCE_02_A'), - ('SITE_02', 100, 'SEQUENCE_02_A'), - - ('SITE_03', 1, 'SEQUENCE_03_A'), - ('SITE_03', 2, 'SEQUENCE_03_A'), - ('SITE_03', 5, 'SEQUENCE_03_A'), - ('SITE_03', 10, 'SEQUENCE_03_A'), - ('SITE_03', 100, 'SEQUENCE_03_A'), - ] + ("site_list", "max_next_requests", "expected_sequence"), + [ + ("SITE_01", 1, "SEQUENCE_01_A"), + ("SITE_01", 2, "SEQUENCE_01_A"), + ("SITE_01", 5, "SEQUENCE_01_A"), + ("SITE_01", 10, "SEQUENCE_01_A"), + ("SITE_01", 100, "SEQUENCE_01_A"), + ("SITE_02", 1, "SEQUENCE_02_A"), + ("SITE_02", 2, "SEQUENCE_02_A"), + ("SITE_02", 5, "SEQUENCE_02_A"), + ("SITE_02", 10, "SEQUENCE_02_A"), + ("SITE_02", 100, "SEQUENCE_02_A"), + ("SITE_03", 1, 
"SEQUENCE_03_A"), + ("SITE_03", 2, "SEQUENCE_03_A"), + ("SITE_03", 5, "SEQUENCE_03_A"), + ("SITE_03", 10, "SEQUENCE_03_A"), + ("SITE_03", 100, "SEQUENCE_03_A"), + ], ) def test_sequence(self, site_list, max_next_requests, expected_sequence): self.assert_sequence( @@ -179,124 +250,339 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): class LIFOBackendTest(BackendSequenceTest): - EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A12', - 'A122', 'A1222', 'A1221', - 'A121', 'A1212', 'A1211', - 'A11', - 'A112', 'A1122', 'A1121', - 'A111', 'A1112', 'A1111' + "A1", + "A12", + "A122", + "A1222", + "A1221", + "A121", + "A1212", + "A1211", + "A11", + "A112", + "A1122", + "A1121", + "A111", + "A1112", + "A1111", ], "SEQUENCE_01_B": [ - 'A1', - 'A12', 'A11', - 'A112', 'A111', - 'A1112', 'A1111', 'A1122', 'A1121', - 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221'], + "A1", + "A12", + "A11", + "A112", + "A111", + "A1112", + "A1111", + "A1122", + "A1121", + "A122", + "A121", + "A1212", + "A1211", + "A1222", + "A1221", + ], "SEQUENCE_01_C": [ - 'A1', - 'A12', 'A11', - 'A112', 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111', 'A1122', 'A1121' + "A1", + "A12", + "A11", + "A112", + "A111", + "A122", + "A121", + "A1212", + "A1211", + "A1222", + "A1221", + "A1112", + "A1111", + "A1122", + "A1121", ], "SEQUENCE_02_A": [ - 'B1', - 'B12', 'B122', 'B1222', 'B1221', 'B121', 'B1212', 'B1211', - 'B11', 'B112', 'B1122', 'B1121', 'B111', 'B1112', 'B1111', - 'A1', - 'A12', 'A122', 'A1222', 'A1221', 'A121', 'A1212', 'A1211', - 'A11', 'A112', 'A1122', 'A1121', 'A111', 'A1112', 'A1111' + "B1", + "B12", + "B122", + "B1222", + "B1221", + "B121", + "B1212", + "B1211", + "B11", + "B112", + "B1122", + "B1121", + "B111", + "B1112", + "B1111", + "A1", + "A12", + "A122", + "A1222", + "A1221", + "A121", + "A1212", + "A1211", + "A11", + "A112", + "A1122", + "A1121", + "A111", + "A1112", + "A1111", ], "SEQUENCE_02_B": [ - 'B1', 'A1', - 'A12', 'A11', - 'A112', 'A111', - 'A1112', 'A1111', 'A1122', 'A1121', - 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', - 'B12', 'B11', - 'B112', 'B111', - 'B1112', 'B1111', 'B1122', 'B1121', - 'B122', 'B121', - 'B1212', 'B1211', 'B1222', 'B1221' + "B1", + "A1", + "A12", + "A11", + "A112", + "A111", + "A1112", + "A1111", + "A1122", + "A1121", + "A122", + "A121", + "A1212", + "A1211", + "A1222", + "A1221", + "B12", + "B11", + "B112", + "B111", + "B1112", + "B1111", + "B1122", + "B1121", + "B122", + "B121", + "B1212", + "B1211", + "B1222", + "B1221", ], "SEQUENCE_02_C": [ - 'B1', 'A1', - 'A12', 'A11', 'B12', 'B11', 'B112', 'B111', 'B122', 'B121', 'A112', - 'A1122', 'A1121', 'B1212', 'B1211', 'B1222', 'B1221', 'B1112', 'B1111', 'B1122', 'B1121', - 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111' + "B1", + "A1", + "A12", + "A11", + "B12", + "B11", + "B112", + "B111", + "B122", + "B121", + "A112", + "A1122", + "A1121", + "B1212", + "B1211", + "B1222", + "B1221", + "B1112", + "B1111", + "B1122", + "B1121", + "A111", + "A122", + "A121", + "A1212", + "A1211", + "A1222", + "A1221", + "A1112", + "A1111", ], "SEQUENCE_02_D": [ - 'B1', 'A1', - 'A12', 'A11', 'B12', 'B11', 'B112', 'B111', 'B122', 'B121', 'A112', 'A111', 'A122', 'A121', - 'A1212', 'A1211', 'A1222', 'A1221', 'A1112', 'A1111', 'A1122', 'A1121', - 'B1212', 'B1211', 'B1222', 'B1221', 'B1112', 'B1111', 'B1122', 'B1121' + "B1", + "A1", + "A12", + "A11", + "B12", + "B11", + "B112", + "B111", + "B122", + "B121", + "A112", + "A111", + "A122", + "A121", + 
"A1212", + "A1211", + "A1222", + "A1221", + "A1112", + "A1111", + "A1122", + "A1121", + "B1212", + "B1211", + "B1222", + "B1221", + "B1112", + "B1111", + "B1122", + "B1121", ], - "SEQUENCE_03_A": [ - 'C1', 'C12', 'C122', 'C1222', 'C12222', 'C12221', 'C1221', 'C12212', 'C12211', - 'C121', 'C1212', 'C12122', 'C12121', 'C1211', 'C12112', 'C12111', - 'C11', 'C112', 'C1122', 'C11222', 'C11221', 'C1121', 'C11212', 'C11211', - 'C111', 'C1112', 'C11122', 'C11121', 'C1111', 'C11112', 'C11111' + "C1", + "C12", + "C122", + "C1222", + "C12222", + "C12221", + "C1221", + "C12212", + "C12211", + "C121", + "C1212", + "C12122", + "C12121", + "C1211", + "C12112", + "C12111", + "C11", + "C112", + "C1122", + "C11222", + "C11221", + "C1121", + "C11212", + "C11211", + "C111", + "C1112", + "C11122", + "C11121", + "C1111", + "C11112", + "C11111", ], "SEQUENCE_03_B": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', - 'C1112', 'C1111', 'C11112', 'C11111', 'C11122', 'C11121', - 'C1122', 'C1121', 'C11212', 'C11211', 'C11222', 'C11221', - 'C122', 'C121', - 'C1212', 'C1211', 'C12112', 'C12111', 'C12122', 'C12121', - 'C1222', 'C1221', 'C12212', 'C12211', 'C12222', 'C12221' + "C1", + "C12", + "C11", + "C112", + "C111", + "C1112", + "C1111", + "C11112", + "C11111", + "C11122", + "C11121", + "C1122", + "C1121", + "C11212", + "C11211", + "C11222", + "C11221", + "C122", + "C121", + "C1212", + "C1211", + "C12112", + "C12111", + "C12122", + "C12121", + "C1222", + "C1221", + "C12212", + "C12211", + "C12222", + "C12221", ], "SEQUENCE_03_C": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', 'C122', 'C121', - 'C1212', 'C1211', 'C1222', 'C1221', 'C1112', - 'C11122', 'C11121', 'C12212', 'C12211', - 'C12222', 'C12221', 'C12112', 'C12111', - 'C12122', 'C12121', - 'C1111', 'C1122', 'C1121', 'C11212', - 'C11211', 'C11222', 'C11221', 'C11112', 'C11111' + "C1", + "C12", + "C11", + "C112", + "C111", + "C122", + "C121", + "C1212", + "C1211", + "C1222", + "C1221", + "C1112", + "C11122", + "C11121", + "C12212", + "C12211", + "C12222", + "C12221", + "C12112", + "C12111", + "C12122", + "C12121", + "C1111", + "C1122", + "C1121", + "C11212", + "C11211", + "C11222", + "C11221", + "C11112", + "C11111", ], "SEQUENCE_03_D": [ - 'C1', - 'C12', 'C11', - 'C112', 'C111', 'C122', 'C121', - 'C1212', 'C1211', 'C1222', 'C1221', - 'C1112', 'C1111', 'C1122', 'C1121', - 'C11212', 'C11211', 'C11222', 'C11221', 'C11112', 'C11111', 'C11122', 'C11121', - 'C12212', 'C12211', 'C12222', 'C12221', 'C12112', 'C12111', 'C12122', 'C12121' + "C1", + "C12", + "C11", + "C112", + "C111", + "C122", + "C121", + "C1212", + "C1211", + "C1222", + "C1221", + "C1112", + "C1111", + "C1122", + "C1121", + "C11212", + "C11211", + "C11222", + "C11221", + "C11112", + "C11111", + "C11122", + "C11121", + "C12212", + "C12211", + "C12222", + "C12221", + "C12112", + "C12111", + "C12122", + "C12121", ], } @pytest.mark.parametrize( - ('site_list', 'max_next_requests', 'expected_sequence'), [ - - ('SITE_01', 1, 'SEQUENCE_01_A'), - ('SITE_01', 2, 'SEQUENCE_01_B'), - ('SITE_01', 5, 'SEQUENCE_01_C'), - ('SITE_01', 10, 'SEQUENCE_01_C'), - ('SITE_01', 100, 'SEQUENCE_01_C'), - - ('SITE_02', 1, 'SEQUENCE_02_A'), - ('SITE_02', 2, 'SEQUENCE_02_B'), - ('SITE_02', 5, 'SEQUENCE_02_C'), - ('SITE_02', 10, 'SEQUENCE_02_D'), - ('SITE_02', 100, 'SEQUENCE_02_D'), - - ('SITE_03', 1, 'SEQUENCE_03_A'), - ('SITE_03', 2, 'SEQUENCE_03_B'), - ('SITE_03', 5, 'SEQUENCE_03_C'), - ('SITE_03', 10, 'SEQUENCE_03_D'), - ('SITE_03', 100, 'SEQUENCE_03_D'), - ] + ("site_list", "max_next_requests", "expected_sequence"), + [ + ("SITE_01", 1, 
"SEQUENCE_01_A"), + ("SITE_01", 2, "SEQUENCE_01_B"), + ("SITE_01", 5, "SEQUENCE_01_C"), + ("SITE_01", 10, "SEQUENCE_01_C"), + ("SITE_01", 100, "SEQUENCE_01_C"), + ("SITE_02", 1, "SEQUENCE_02_A"), + ("SITE_02", 2, "SEQUENCE_02_B"), + ("SITE_02", 5, "SEQUENCE_02_C"), + ("SITE_02", 10, "SEQUENCE_02_D"), + ("SITE_02", 100, "SEQUENCE_02_D"), + ("SITE_03", 1, "SEQUENCE_03_A"), + ("SITE_03", 2, "SEQUENCE_03_B"), + ("SITE_03", 5, "SEQUENCE_03_C"), + ("SITE_03", 10, "SEQUENCE_03_D"), + ("SITE_03", 100, "SEQUENCE_03_D"), + ], ) def test_sequence(self, site_list, max_next_requests, expected_sequence): self.assert_sequence( @@ -307,135 +593,339 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): class DFSBackendTest(BackendSequenceTest): - EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A111', 'A1111', 'A1112', 'A112', 'A1121', 'A1122', - 'A12', 'A121', 'A1211', 'A1212', 'A122', 'A1221', 'A1222' + "A1", + "A11", + "A111", + "A1111", + "A1112", + "A112", + "A1121", + "A1122", + "A12", + "A121", + "A1211", + "A1212", + "A122", + "A1221", + "A1222", ], "SEQUENCE_01_B": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A121', 'A122', - 'A1211', 'A1212', 'A1221', 'A1222' + "A1", + "A11", + "A12", + "A111", + "A112", + "A1111", + "A1112", + "A1121", + "A1122", + "A121", + "A122", + "A1211", + "A1212", + "A1221", + "A1222", ], "SEQUENCE_01_C": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222' + "A1", + "A11", + "A12", + "A111", + "A112", + "A121", + "A122", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", ], "SEQUENCE_02_A": [ - 'A1', - 'A11', - 'A111', 'A1111', 'A1112', - 'A112', 'A1121', 'A1122', - 'A12', - 'A121', 'A1211', 'A1212', - 'A122', 'A1221', 'A1222', - 'B1', - 'B11', - 'B111', 'B1111', 'B1112', - 'B112', 'B1121', 'B1122', - 'B12', - 'B121', 'B1211', 'B1212', - 'B122', 'B1221', 'B1222' + "A1", + "A11", + "A111", + "A1111", + "A1112", + "A112", + "A1121", + "A1122", + "A12", + "A121", + "A1211", + "A1212", + "A122", + "A1221", + "A1222", + "B1", + "B11", + "B111", + "B1111", + "B1112", + "B112", + "B1121", + "B1122", + "B12", + "B121", + "B1211", + "B1212", + "B122", + "B1221", + "B1222", ], "SEQUENCE_02_B": [ - 'A1', 'B1', - 'A11', 'A12', - 'A111', 'A112', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A121', 'A122', - 'A1211', 'A1212', 'A1221', 'A1222', - 'B11', 'B12', - 'B111', 'B112', - 'B1111', 'B1112', 'B1121', 'B1122', - 'B121', 'B122', - 'B1211', 'B1212', 'B1221', 'B1222' + "A1", + "B1", + "A11", + "A12", + "A111", + "A112", + "A1111", + "A1112", + "A1121", + "A1122", + "A121", + "A122", + "A1211", + "A1212", + "A1221", + "A1222", + "B11", + "B12", + "B111", + "B112", + "B1111", + "B1112", + "B1121", + "B1122", + "B121", + "B122", + "B1211", + "B1212", + "B1221", + "B1222", ], "SEQUENCE_02_C": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', 'B1111', 'B1112', - 'B112', 'B121', 'B122', - 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + "A1", + "B1", + "A11", + "A12", + "B11", + "B12", + "A111", + "A112", + "A121", + "A122", + "B111", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", + "B1111", + "B1112", + "B112", + "B121", + "B122", + "B1121", + "B1122", + "B1211", + "B1212", + "B1221", + "B1222", ], "SEQUENCE_02_D": [ - 'A1', 'B1', - 'A11', 'A12', 
'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', - 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + "A1", + "B1", + "A11", + "A12", + "B11", + "B12", + "A111", + "A112", + "A121", + "A122", + "B111", + "B112", + "B121", + "B122", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", + "B1111", + "B1112", + "B1121", + "B1122", + "B1211", + "B1212", + "B1221", + "B1222", ], "SEQUENCE_03_A": [ - 'C1', - 'C11', - 'C111', 'C1111', 'C11111', 'C11112', 'C1112', 'C11121', 'C11122', - 'C112', 'C1121', 'C11211', 'C11212', 'C1122', 'C11221', 'C11222', - 'C12', - 'C121', 'C1211', 'C12111', 'C12112', 'C1212', 'C12121', 'C12122', - 'C122', 'C1221', 'C12211', 'C12212', 'C1222', 'C12221', 'C12222' + "C1", + "C11", + "C111", + "C1111", + "C11111", + "C11112", + "C1112", + "C11121", + "C11122", + "C112", + "C1121", + "C11211", + "C11212", + "C1122", + "C11221", + "C11222", + "C12", + "C121", + "C1211", + "C12111", + "C12112", + "C1212", + "C12121", + "C12122", + "C122", + "C1221", + "C12211", + "C12212", + "C1222", + "C12221", + "C12222", ], "SEQUENCE_03_B": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', - 'C1111', 'C1112', - 'C11111', 'C11112', 'C11121', 'C11122', - 'C1121', 'C1122', - 'C11211', 'C11212', 'C11221', 'C11222', - 'C121', 'C122', - 'C1211', 'C1212', - 'C12111', 'C12112', 'C12121', 'C12122', - 'C1221', 'C1222', - 'C12211', 'C12212', 'C12221', 'C12222' + "C1", + "C11", + "C12", + "C111", + "C112", + "C1111", + "C1112", + "C11111", + "C11112", + "C11121", + "C11122", + "C1121", + "C1122", + "C11211", + "C11212", + "C11221", + "C11222", + "C121", + "C122", + "C1211", + "C1212", + "C12111", + "C12112", + "C12121", + "C12122", + "C1221", + "C1222", + "C12211", + "C12212", + "C12221", + "C12222", ], "SEQUENCE_03_C": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', 'C12111', 'C12112', - 'C1212', 'C1221', 'C1222', - 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + "C1", + "C11", + "C12", + "C111", + "C112", + "C121", + "C122", + "C1111", + "C1112", + "C1121", + "C1122", + "C1211", + "C11111", + "C11112", + "C11121", + "C11122", + "C11211", + "C11212", + "C11221", + "C11222", + "C12111", + "C12112", + "C1212", + "C1221", + "C1222", + "C12121", + "C12122", + "C12211", + "C12212", + "C12221", + "C12222", ], "SEQUENCE_03_D": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + "C1", + "C11", + "C12", + "C111", + "C112", + "C121", + "C122", + "C1111", + "C1112", + "C1121", + "C1122", + "C1211", + "C1212", + "C1221", + "C1222", + "C11111", + "C11112", + "C11121", + "C11122", + "C11211", + "C11212", + "C11221", + "C11222", + "C12111", + "C12112", + "C12121", + "C12122", + "C12211", + "C12212", + "C12221", + "C12222", ], } @pytest.mark.parametrize( - ('site_list', 'max_next_requests', 'expected_sequence'), [ - - ('SITE_01', 1, 'SEQUENCE_01_A'), - ('SITE_01', 2, 'SEQUENCE_01_B'), - ('SITE_01', 5, 'SEQUENCE_01_C'), - ('SITE_01', 10, 'SEQUENCE_01_C'), - ('SITE_01', 100, 'SEQUENCE_01_C'), - - ('SITE_02', 1, 'SEQUENCE_02_A'), - ('SITE_02', 2, 'SEQUENCE_02_B'), - 
('SITE_02', 5, 'SEQUENCE_02_C'), - ('SITE_02', 10, 'SEQUENCE_02_D'), - ('SITE_02', 100, 'SEQUENCE_02_D'), - - ('SITE_03', 1, 'SEQUENCE_03_A'), - ('SITE_03', 2, 'SEQUENCE_03_B'), - ('SITE_03', 5, 'SEQUENCE_03_C'), - ('SITE_03', 10, 'SEQUENCE_03_D'), - ('SITE_03', 100, 'SEQUENCE_03_D'), - ] + ("site_list", "max_next_requests", "expected_sequence"), + [ + ("SITE_01", 1, "SEQUENCE_01_A"), + ("SITE_01", 2, "SEQUENCE_01_B"), + ("SITE_01", 5, "SEQUENCE_01_C"), + ("SITE_01", 10, "SEQUENCE_01_C"), + ("SITE_01", 100, "SEQUENCE_01_C"), + ("SITE_02", 1, "SEQUENCE_02_A"), + ("SITE_02", 2, "SEQUENCE_02_B"), + ("SITE_02", 5, "SEQUENCE_02_C"), + ("SITE_02", 10, "SEQUENCE_02_D"), + ("SITE_02", 100, "SEQUENCE_02_D"), + ("SITE_03", 1, "SEQUENCE_03_A"), + ("SITE_03", 2, "SEQUENCE_03_B"), + ("SITE_03", 5, "SEQUENCE_03_C"), + ("SITE_03", 10, "SEQUENCE_03_D"), + ("SITE_03", 100, "SEQUENCE_03_D"), + ], ) def test_sequence(self, site_list, max_next_requests, expected_sequence): self.assert_sequence( @@ -446,53 +936,110 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): class BFSBackendTest(BackendSequenceTest): - EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'A1', - 'A11', 'A12', - 'A111', 'A112', 'A121', 'A122', - 'A1111', 'A1112', 'A1121', 'A1122', - 'A1211', 'A1212', 'A1221', 'A1222' + "A1", + "A11", + "A12", + "A111", + "A112", + "A121", + "A122", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", ], "SEQUENCE_02_A": [ - 'A1', 'B1', - 'A11', 'A12', 'B11', 'B12', - 'A111', 'A112', 'A121', 'A122', 'B111', 'B112', 'B121', 'B122', - 'A1111', 'A1112', 'A1121', 'A1122', 'A1211', 'A1212', 'A1221', 'A1222', - 'B1111', 'B1112', 'B1121', 'B1122', 'B1211', 'B1212', 'B1221', 'B1222' + "A1", + "B1", + "A11", + "A12", + "B11", + "B12", + "A111", + "A112", + "A121", + "A122", + "B111", + "B112", + "B121", + "B122", + "A1111", + "A1112", + "A1121", + "A1122", + "A1211", + "A1212", + "A1221", + "A1222", + "B1111", + "B1112", + "B1121", + "B1122", + "B1211", + "B1212", + "B1221", + "B1222", ], "SEQUENCE_03_A": [ - 'C1', - 'C11', 'C12', - 'C111', 'C112', 'C121', 'C122', - 'C1111', 'C1112', 'C1121', 'C1122', 'C1211', 'C1212', 'C1221', 'C1222', - 'C11111', 'C11112', 'C11121', 'C11122', 'C11211', 'C11212', 'C11221', 'C11222', - 'C12111', 'C12112', 'C12121', 'C12122', 'C12211', 'C12212', 'C12221', 'C12222' + "C1", + "C11", + "C12", + "C111", + "C112", + "C121", + "C122", + "C1111", + "C1112", + "C1121", + "C1122", + "C1211", + "C1212", + "C1221", + "C1222", + "C11111", + "C11112", + "C11121", + "C11122", + "C11211", + "C11212", + "C11221", + "C11222", + "C12111", + "C12112", + "C12121", + "C12122", + "C12211", + "C12212", + "C12221", + "C12222", ], } @pytest.mark.parametrize( - ('site_list', 'max_next_requests', 'expected_sequence'), [ - - ('SITE_01', 1, 'SEQUENCE_01_A'), - ('SITE_01', 2, 'SEQUENCE_01_A'), - ('SITE_01', 5, 'SEQUENCE_01_A'), - ('SITE_01', 10, 'SEQUENCE_01_A'), - ('SITE_01', 100, 'SEQUENCE_01_A'), - - ('SITE_02', 1, 'SEQUENCE_02_A'), - ('SITE_02', 2, 'SEQUENCE_02_A'), - ('SITE_02', 5, 'SEQUENCE_02_A'), - ('SITE_02', 10, 'SEQUENCE_02_A'), - ('SITE_02', 100, 'SEQUENCE_02_A'), - - ('SITE_03', 1, 'SEQUENCE_03_A'), - ('SITE_03', 2, 'SEQUENCE_03_A'), - ('SITE_03', 5, 'SEQUENCE_03_A'), - ('SITE_03', 10, 'SEQUENCE_03_A'), - ('SITE_03', 100, 'SEQUENCE_03_A'), - ] + ("site_list", "max_next_requests", "expected_sequence"), + [ + ("SITE_01", 1, "SEQUENCE_01_A"), + ("SITE_01", 2, "SEQUENCE_01_A"), + ("SITE_01", 5, "SEQUENCE_01_A"), + ("SITE_01", 10, "SEQUENCE_01_A"), + 
("SITE_01", 100, "SEQUENCE_01_A"), + ("SITE_02", 1, "SEQUENCE_02_A"), + ("SITE_02", 2, "SEQUENCE_02_A"), + ("SITE_02", 5, "SEQUENCE_02_A"), + ("SITE_02", 10, "SEQUENCE_02_A"), + ("SITE_02", 100, "SEQUENCE_02_A"), + ("SITE_03", 1, "SEQUENCE_03_A"), + ("SITE_03", 2, "SEQUENCE_03_A"), + ("SITE_03", 5, "SEQUENCE_03_A"), + ("SITE_03", 10, "SEQUENCE_03_A"), + ("SITE_03", 100, "SEQUENCE_03_A"), + ], ) def test_sequence(self, site_list, max_next_requests, expected_sequence): self.assert_sequence( @@ -503,28 +1050,25 @@ def test_sequence(self, site_list, max_next_requests, expected_sequence): class RANDOMBackendTest(BackendSequenceTest): - @pytest.mark.parametrize( - ('site_list', 'max_next_requests'), [ - - ('SITE_01', 1), - ('SITE_01', 2), - ('SITE_01', 5), - ('SITE_01', 10), - ('SITE_01', 100), - - ('SITE_02', 1), - ('SITE_02', 2), - ('SITE_02', 5), - ('SITE_02', 10), - ('SITE_02', 100), - - ('SITE_03', 1), - ('SITE_03', 2), - ('SITE_03', 5), - ('SITE_03', 10), - ('SITE_03', 100), - ] + ("site_list", "max_next_requests"), + [ + ("SITE_01", 1), + ("SITE_01", 2), + ("SITE_01", 5), + ("SITE_01", 10), + ("SITE_01", 100), + ("SITE_02", 1), + ("SITE_02", 2), + ("SITE_02", 5), + ("SITE_02", 10), + ("SITE_02", 100), + ("SITE_03", 1), + ("SITE_03", 2), + ("SITE_03", 5), + ("SITE_03", 10), + ("SITE_03", 100), + ], ) def test_sequence(self, site_list, max_next_requests): sequence = self.get_url_sequence( diff --git a/tests/contrib/__init__.py b/tests/contrib/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/contrib/backends/__init__.py b/tests/contrib/backends/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/contrib/backends/hbase/test_hbase.py b/tests/contrib/backends/hbase/test_hbase.py index e0a039fd1..6c6091afa 100644 --- a/tests/contrib/backends/hbase/test_hbase.py +++ b/tests/contrib/backends/hbase/test_hbase.py @@ -1,110 +1,169 @@ -from __future__ import absolute_import +import pytest + +pytest.importorskip("happybase") + +from binascii import unhexlify +from time import time +from unittest import TestCase, mock + from happybase import Connection from Hbase_thrift import AlreadyExists # module loaded at runtime in happybase -from frontera.contrib.backends.hbase import HBaseState, HBaseMetadata, HBaseQueue -from frontera.core.models import Request, Response +from thriftpy2.transport import TTransportException +from w3lib.util import to_unicode + +from frontera.contrib.backends.hbase import HBaseMetadata, HBaseQueue, HBaseState from frontera.core.components import States -from binascii import unhexlify -from time import time -from w3lib.util import to_native_str -from tests import mock +from frontera.core.models import Request, Response -r1 = Request('https://www.example.com', meta={b'fingerprint': b'10', - b'domain': {b'name': b'www.example.com', b'fingerprint': b'81'}}) -r2 = Request('http://example.com/some/page/', meta={b'fingerprint': b'11', - b'domain': {b'name': b'example.com', b'fingerprint': b'82'}}) -r3 = Request('http://www.scrapy.org', meta={b'fingerprint': b'12', - b'domain': {b'name': b'www.scrapy.org', b'fingerprint': b'83'}}) +r1 = Request( + "https://www.example.com", + meta={ + b"fingerprint": b"10", + b"domain": {b"name": b"www.example.com", b"fingerprint": b"81"}, + }, +) +r2 = Request( + "http://example.com/some/page/", + meta={ + b"fingerprint": b"11", + b"domain": {b"name": b"example.com", b"fingerprint": b"82"}, + }, +) +r3 = Request( + "http://www.scrapy.org", + meta={ + b"fingerprint": b"12", + b"domain": {b"name": 
b"www.scrapy.org", b"fingerprint": b"83"}, + }, +) r4 = r3.copy() -class TestHBaseBackend(object): - +class TestHBaseBackend(TestCase): def delete_rows(self, table, row_keys): batch = table.batch() for key in row_keys: batch.delete(unhexlify(key)) batch.send() + def get_connection(self): + try: + return Connection(host="hbase-docker", port=9090) + except TTransportException: + raise self.skipTest("No running hbase-docker image") + def test_metadata(self): - connection = Connection(host='hbase-docker', port=9090) - metadata = HBaseMetadata(connection, b'metadata', True, False, 300000, True) + connection = self.get_connection() + metadata = HBaseMetadata(connection, b"metadata", True, False, 300000, True) metadata.add_seeds([r1, r2, r3]) - resp = Response('https://www.example.com', request=r1) + resp = Response("https://www.example.com", request=r1) metadata.page_crawled(resp) metadata.links_extracted(resp.request, [r2, r3]) - metadata.request_error(r4, 'error') + metadata.request_error(r4, "error") metadata.frontier_stop() - table = connection.table('metadata') - assert set([to_native_str(data[b'm:url'], 'utf-8') for _, data in table.scan()]) == \ - set([r1.url, r2.url, r3.url]) - self.delete_rows(table, [b'10', b'11', b'12']) + table = connection.table("metadata") + assert {to_unicode(data[b"m:url"], "utf-8") for _, data in table.scan()} == { + r1.url, + r2.url, + r3.url, + } + self.delete_rows(table, [b"10", b"11", b"12"]) def test_queue(self): - connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 2, b'queue', True) - batch = [('10', 0.5, r1, True), ('11', 0.6, r2, True), - ('12', 0.7, r3, True)] + connection = self.get_connection() + queue = HBaseQueue(connection, 2, b"queue", True) + batch = [("10", 0.5, r1, True), ("11", 0.6, r2, True), ("12", 0.7, r3, True)] queue.schedule(batch) - assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r3.url]) - assert set([r.url for r in queue.get_next_requests(10, 1, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r1.url, r2.url]) + assert { + r.url + for r in queue.get_next_requests( + 10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10 + ) + } == {r3.url} + assert { + r.url + for r in queue.get_next_requests( + 10, 1, min_requests=3, min_hosts=1, max_requests_per_host=10 + ) + } == {r1.url, r2.url} def test_queue_with_delay(self): - connection = Connection(host='hbase-docker', port=9090) - queue = HBaseQueue(connection, 1, b'queue', True) + connection = self.get_connection() + queue = HBaseQueue(connection, 1, b"queue", True) r5 = r3.copy() crawl_at = int(time()) + 1000 - r5.meta[b'crawl_at'] = crawl_at - batch = [(r5.meta[b'fingerprint'], 0.5, r5, True)] + r5.meta[b"crawl_at"] = crawl_at + batch = [(r5.meta[b"fingerprint"], 0.5, r5, True)] queue.schedule(batch) - with mock.patch('frontera.contrib.backends.hbase.time') as mocked_time: + with mock.patch("frontera.contrib.backends.hbase.time") as mocked_time: mocked_time.return_value = time() - assert queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, - max_requests_per_host=10) == [] + assert ( + queue.get_next_requests( + 10, 0, min_requests=3, min_hosts=1, max_requests_per_host=10 + ) + == [] + ) mocked_time.return_value = crawl_at + 1 - assert set([r.url for r in queue.get_next_requests(10, 0, min_requests=3, min_hosts=1, - max_requests_per_host=10)]) == set([r5.url]) + assert { + r.url + for r in queue.get_next_requests( + 10, 0, min_requests=3, 
min_hosts=1, max_requests_per_host=10 + ) + } == {r5.url} def test_state(self): - connection = Connection(host='hbase-docker', port=9090) - state = HBaseState(connection, b'metadata', 300000) + connection = self.get_connection() + state = HBaseState(connection, b"metadata", 300000) state.set_states([r1, r2, r3]) - assert [r.meta[b'state'] for r in [r1, r2, r3]] == [States.NOT_CRAWLED]*3 + assert [r.meta[b"state"] for r in [r1, r2, r3]] == [States.NOT_CRAWLED] * 3 state.update_cache([r1, r2, r3]) - assert state._state_cache == {b'10': States.NOT_CRAWLED, - b'11': States.NOT_CRAWLED, - b'12': States.NOT_CRAWLED} - r1.meta[b'state'] = States.CRAWLED - r2.meta[b'state'] = States.CRAWLED - r3.meta[b'state'] = States.CRAWLED + assert state._state_cache == { + b"10": States.NOT_CRAWLED, + b"11": States.NOT_CRAWLED, + b"12": States.NOT_CRAWLED, + } + r1.meta[b"state"] = States.CRAWLED + r2.meta[b"state"] = States.CRAWLED + r3.meta[b"state"] = States.CRAWLED state.update_cache([r1, r2, r3]) state.flush(True) assert state._state_cache == {} - state.fetch([b'10', b'11', b'12']) - assert state._state_cache == {b'10': States.CRAWLED, - b'11': States.CRAWLED, - b'12': States.CRAWLED} - r4.meta[b'state'] = States.ERROR + state.fetch([b"10", b"11", b"12"]) + assert state._state_cache == { + b"10": States.CRAWLED, + b"11": States.CRAWLED, + b"12": States.CRAWLED, + } + r4.meta[b"state"] = States.ERROR state.set_states([r1, r2, r4]) - assert r4.meta[b'state'] == States.CRAWLED + assert r4.meta[b"state"] == States.CRAWLED state.flush(True) assert state._state_cache == {} def test_drop_all_tables_when_table_name_is_str(self): - connection = Connection(host='hbase-docker', port=9090) + connection = self.get_connection() for table in connection.tables(): connection.delete_table(table, True) - hbase_queue_table = 'queue' - hbase_metadata_table = 'metadata' - connection.create_table(hbase_queue_table, {'f': {'max_versions': 1}}) - connection.create_table(hbase_metadata_table, {'f': {'max_versions': 1}}) + hbase_queue_table = "queue" + hbase_metadata_table = "metadata" + connection.create_table(hbase_queue_table, {"f": {"max_versions": 1}}) + connection.create_table(hbase_metadata_table, {"f": {"max_versions": 1}}) tables = connection.tables() - assert set(tables) == set([b'metadata', b'queue']) # Failure of test itself + assert set(tables) == {b"metadata", b"queue"} # Failure of test itself try: - HBaseQueue(connection=connection, partitions=1, table_name=hbase_queue_table, drop=True) - HBaseMetadata(connection=connection, table_name=hbase_metadata_table, drop_all_tables=True, - use_snappy=False, batch_size=300000, store_content=True) - except AlreadyExists: - assert False, "failed to drop hbase tables" + HBaseQueue( + connection=connection, + partitions=1, + table_name=hbase_queue_table, + drop=True, + ) + HBaseMetadata( + connection=connection, + table_name=hbase_metadata_table, + drop_all_tables=True, + use_snappy=False, + batch_size=300000, + store_content=True, + ) + except AlreadyExists as e: + raise AssertionError("failed to drop hbase tables") from e diff --git a/tests/contrib/backends/memory/__init__.py b/tests/contrib/backends/memory/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/contrib/backends/memory/test_backend_memory.py b/tests/contrib/backends/memory/test_backend_memory.py index 4b1c6cf79..efeec089b 100644 --- a/tests/contrib/backends/memory/test_backend_memory.py +++ b/tests/contrib/backends/memory/test_backend_memory.py @@ -1,31 +1,34 @@ -from __future__ import 
absolute_import -from tests.test_overused_buffer import DFSOverusedBackendTest +import pytest + +pytest.importorskip("sqlalchemy.engine") + from tests import backends +from tests.test_overused_buffer import DFSOverusedBackendTest class TestFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.memory.FIFO' + backend_class = "frontera.contrib.backends.memory.FIFO" class TestLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.memory.LIFO' + backend_class = "frontera.contrib.backends.memory.LIFO" class TestDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.DFS' + backend_class = "frontera.contrib.backends.memory.DFS" class TestDFSOverused(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' + backend_class = "frontera.contrib.backends.memory.MemoryDFSOverusedBackend" class TestDFSOverusedSimulation(DFSOverusedBackendTest): - backend_class = 'frontera.contrib.backends.memory.MemoryDFSOverusedBackend' + backend_class = "frontera.contrib.backends.memory.MemoryDFSOverusedBackend" class TestBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.memory.BFS' + backend_class = "frontera.contrib.backends.memory.BFS" class TestRANDOM(backends.RANDOMBackendTest): - backend_class = 'frontera.contrib.backends.memory.RANDOM' + backend_class = "frontera.contrib.backends.memory.RANDOM" diff --git a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py b/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py index 0dceaaa7d..7fefd23c5 100644 --- a/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py +++ b/tests/contrib/backends/sqlalchemy/test_backend_sqlalchemy.py @@ -1,5 +1,10 @@ -from __future__ import absolute_import -import os +import pytest + +pytest.importorskip("sqlalchemy.engine") +pytest.importorskip("pymysql") + +import contextlib +from pathlib import Path import pymysql from psycopg2 import connect @@ -9,37 +14,36 @@ from tests.test_revisiting_backend import RevisitingBackendTest -#---------------------------------------------------- +# ---------------------------------------------------- # SQAlchemy base classes -#---------------------------------------------------- +# ---------------------------------------------------- class SQLAlchemyFIFO(backends.FIFOBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.FIFO' + backend_class = "frontera.contrib.backends.sqlalchemy.FIFO" class SQLAlchemyLIFO(backends.LIFOBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.LIFO' + backend_class = "frontera.contrib.backends.sqlalchemy.LIFO" class SQLAlchemyDFS(backends.DFSBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.DFS' + backend_class = "frontera.contrib.backends.sqlalchemy.DFS" class SQLAlchemyBFS(backends.BFSBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.BFS' + backend_class = "frontera.contrib.backends.sqlalchemy.BFS" class SQLAlchemyRevisiting(RevisitingBackendTest): - backend_class = 'frontera.contrib.backends.sqlalchemy.revisiting.Backend' + backend_class = "frontera.contrib.backends.sqlalchemy.revisiting.Backend" -#---------------------------------------------------- +# ---------------------------------------------------- # SQLite Memory -#---------------------------------------------------- +# ---------------------------------------------------- class SQLiteMemory(backends.BackendTest): - def get_settings(self): - settings = super(SQLiteMemory, 
self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' + settings = super().get_settings() + settings.SQLALCHEMYBACKEND_ENGINE = "sqlite:///:memory:" return settings @@ -63,16 +67,15 @@ class TestSQLiteMemoryRevisiting(SQLAlchemyRevisiting): pass -#---------------------------------------------------- +# ---------------------------------------------------- # SQLite File -#---------------------------------------------------- +# ---------------------------------------------------- class SQLiteFile(backends.BackendTest): - - SQLITE_DB_NAME = 'backend_test.db' + SQLITE_DB_NAME = "backend_test.db" def get_settings(self): - settings = super(SQLiteFile, self).get_settings() - settings.SQLALCHEMYBACKEND_ENGINE = 'sqlite:///' + self.SQLITE_DB_NAME + settings = super().get_settings() + settings.SQLALCHEMYBACKEND_ENGINE = "sqlite:///" + self.SQLITE_DB_NAME return settings def setup_backend(self, method): @@ -82,10 +85,8 @@ def teardown_backend(self, method): self._delete_test_db() def _delete_test_db(self): - try: - os.remove(self.SQLITE_DB_NAME) - except OSError: - pass + with contextlib.suppress(OSError): + Path(self.SQLITE_DB_NAME).unlink() class TestSQLiteFileFIFO(SQLAlchemyFIFO, SQLiteFile): @@ -104,19 +105,18 @@ class TestSQLiteFileBFS(SQLAlchemyBFS, SQLiteFile): pass -#---------------------------------------------------- +# ---------------------------------------------------- # DB Backend test base -#---------------------------------------------------- -class DBBackendTest(object): - - DB_DATABASE = 'backend_test' +# ---------------------------------------------------- +class DBBackendTest: + DB_DATABASE = "backend_test" DB_ENGINE = None DB_HOST = None DB_USER = None DB_PASSWORD = None def get_settings(self): - settings = super(DBBackendTest, self).get_settings() + settings = super().get_settings() settings.SQLALCHEMYBACKEND_ENGINE = self.DB_ENGINE return settings @@ -128,29 +128,28 @@ def teardown_backend(self, method): self._delete_database() def _delete_database(self): - self._execute_sql("DROP DATABASE IF EXISTS %s;" % self.DB_DATABASE) + self._execute_sql(f"DROP DATABASE IF EXISTS {self.DB_DATABASE};") def _create_database(self): - self._execute_sql("CREATE DATABASE %s;" % self.DB_DATABASE) + self._execute_sql(f"CREATE DATABASE {self.DB_DATABASE};") def _execute_sql(self, sql): raise NotImplementedError -#---------------------------------------------------- +# ---------------------------------------------------- # Mysql -#---------------------------------------------------- +# ---------------------------------------------------- class Mysql(DBBackendTest): - - DB_ENGINE = 'mysql+pymysql://root:@localhost/backend_test' - DB_HOST = 'localhost' - DB_USER = 'root' - DB_PASSWORD = '' + DB_ENGINE = "mysql+pymysql://root:@localhost/backend_test" + DB_HOST = "localhost" + DB_USER = "root" + DB_PASSWORD = "" def _execute_sql(self, sql): - conn = pymysql.connect(host=self.DB_HOST, - user=self.DB_USER, - passwd=self.DB_PASSWORD) + conn = pymysql.connect( + host=self.DB_HOST, user=self.DB_USER, passwd=self.DB_PASSWORD + ) cur = conn.cursor() cur.execute(sql) cur.close() @@ -173,20 +172,17 @@ class TestMysqlBFS(Mysql, SQLAlchemyBFS): pass -#---------------------------------------------------- +# ---------------------------------------------------- # Postgres -#---------------------------------------------------- +# ---------------------------------------------------- class Postgres(DBBackendTest): - - DB_ENGINE = 'postgres://postgres@localhost/backend_test' - DB_HOST = 
'localhost' - DB_USER = 'postgres' - DB_PASSWORD = '' + DB_ENGINE = "postgres://postgres@localhost/backend_test" + DB_HOST = "localhost" + DB_USER = "postgres" + DB_PASSWORD = "" def _execute_sql(self, sql): - conn = connect(host=self.DB_HOST, - user=self.DB_USER, - password=self.DB_PASSWORD) + conn = connect(host=self.DB_HOST, user=self.DB_USER, password=self.DB_PASSWORD) conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() cur.execute(sql) diff --git a/tests/mocks/boto.py b/tests/mocks/boto.py index db79ba50e..ba5d45d0c 100644 --- a/tests/mocks/boto.py +++ b/tests/mocks/boto.py @@ -1,8 +1,4 @@ -import six - - -class Content(object): - +class Content: def __init__(self, obj): self.obj = obj @@ -10,8 +6,7 @@ def split(self): return self.obj -class MockKey(object): - +class MockKey: def __init__(self, name, data): self.name = name self.content = Content(data) @@ -20,33 +15,31 @@ def get_contents_as_string(self, *args, **kwargs): return self.content -class MockBucket(object): - +class MockBucket: def __init__(self): self.keys = {} def list(self, prefix): - return [key for name, key in six.iteritems(self.keys) if name.startswith(prefix)] + return [key for name, key in self.keys.items() if name.startswith(prefix)] def add_key(self, name, data): if name in self.keys: - raise Exception('key: %s already exists' % name) + raise Exception(f"key: {name} already exists") self.keys[name] = MockKey(name, data) -class MockConnection(object): - +class MockConnection: def __init__(self): self.buckets = {} def get_bucket(self, bucket_name): try: return self.buckets[bucket_name] - except: - raise Exception('Bucket: %s not found' % bucket_name) + except Exception as e: + raise Exception(f"Bucket: {bucket_name} not found") from e def create_bucket(self, name): if name in self.buckets: - raise Exception('Bucket: %s already exists' % name) + raise Exception(f"Bucket: {name} already exists") self.buckets[name] = MockBucket() return self.buckets[name] diff --git a/tests/mocks/components.py b/tests/mocks/components.py index 801257f10..bd7939ac7 100644 --- a/tests/mocks/components.py +++ b/tests/mocks/components.py @@ -1,12 +1,14 @@ -from __future__ import absolute_import -from frontera.core.components import Backend, Middleware, CanonicalSolver, \ - DistributedBackend, Queue -from six.moves import range +from frontera.core.components import ( + Backend, + CanonicalSolver, + DistributedBackend, + Middleware, + Queue, +) from frontera.core.models import Request class FakeMiddleware(Middleware): - def __init__(self): self.seeds = [] self.responses = [] @@ -15,7 +17,7 @@ def __init__(self): self.lists = [self.seeds, self.responses, self.links, self.errors] self._started = False self._stopped = False - self.test_value = 'test' + self.test_value = "test" @classmethod def from_manager(cls, manager): @@ -47,7 +49,6 @@ def request_error(self, request, error): class FakeQueue(Queue): - def __init__(self): self.requests = [] @@ -56,11 +57,7 @@ def put_requests(self, requests): self.requests.append(request) def get_next_requests(self, max_next_requests, **kwargs): - lst = [] - for i in range(max_next_requests): - if self.requests: - lst.append(self.requests.pop()) - return lst + return [self.requests.pop() for _i in range(max_next_requests) if self.requests] def count(self): return len(self.requests) @@ -68,11 +65,12 @@ def count(self): def schedule(self, batch): for obj in batch: if obj[3]: - self.requests.append(Request(obj[2].url, meta={b'fingerprint': obj[0], b'score': obj[1]})) + self.requests.append( 
+ Request(obj[2].url, meta={b"fingerprint": obj[0], b"score": obj[1]}) + ) class FakeBackend(FakeMiddleware, Backend): - _finished = False queue = FakeQueue() @@ -87,7 +85,6 @@ def get_next_requests(self, max_next_requests, **kwargs): class FakeDistributedBackend(FakeBackend, DistributedBackend): - def __init__(self): FakeBackend.__init__(self) self._queue = FakeQueue() @@ -112,7 +109,6 @@ def get_next_requests(self, max_next_request, partitions, **kwargs): class FakeMiddlewareBlocking(FakeMiddleware): - def add_seeds(self, seeds): for seed in seeds: self.seeds.append(seed) @@ -129,19 +125,17 @@ def request_error(self, request, error): class FakeMiddlewareModifySeeds(FakeMiddleware): - def add_seeds(self, seeds): for seed in seeds: self.seeds.append(seed) - seed.meta[b'test_seeds'] = self.test_value + seed.meta[b"test_seeds"] = self.test_value return seeds class FakeMiddlewareModifyResponse(FakeMiddleware): - def page_crawled(self, response): self.responses.append(response) - response.meta[b'test_response'] = self.test_value + response.meta[b"test_response"] = self.test_value return response def links_extracted(self, request, links): @@ -151,7 +145,6 @@ def links_extracted(self, request, links): class FakeMiddlewareModifyLinks(FakeMiddleware): - def page_crawled(self, response): self.responses.append(response) return response @@ -159,15 +152,15 @@ def page_crawled(self, response): def links_extracted(self, request, links): for link in links: self.links.append(link) - link.meta[b'test_links'] = self.test_value + link.meta[b"test_links"] = self.test_value return request -class FakeCanonicalSolver(CanonicalSolver, FakeMiddleware): +class FakeCanonicalSolver(CanonicalSolver, FakeMiddleware): def add_seeds(self, seeds): for seed in seeds: self.seeds.append(seed) - seed.meta[b'test_seeds_canonical_solver'] = self.test_value + seed.meta[b"test_seeds_canonical_solver"] = self.test_value return seeds def page_crawled(self, response): @@ -177,5 +170,5 @@ def page_crawled(self, response): def links_extracted(self, request, links): for link in links: self.links.append(link) - link.meta[b'test_links_canonical_solver'] = self.test_value + link.meta[b"test_links_canonical_solver"] = self.test_value return request diff --git a/tests/mocks/crawler.py b/tests/mocks/crawler.py index 946724fd8..f55e20b53 100644 --- a/tests/mocks/crawler.py +++ b/tests/mocks/crawler.py @@ -1,31 +1,32 @@ -from __future__ import absolute_import from scrapy.settings import Settings -from frontera.utils.misc import load_object -import six - -class FakeCrawler(object): +from frontera.utils.misc import load_object - class Slot(object): +class FakeCrawler: + class Slot: def __init__(self, active=0, concurrency=0): self.active = active self.concurrency = concurrency def __init__(self, settings=None): self.settings = settings or Settings() - self.stats = load_object(self.settings['STATS_CLASS'])(self) - dummy_class = type('class', (object,), {}) + self.stats = load_object(self.settings["STATS_CLASS"])(self) + dummy_class = type("class", (object,), {}) downloader = dummy_class() downloader.slots = {} - downloader.domain_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_DOMAIN') - downloader.ip_concurrency = self.settings.get('CONCURRENT_REQUESTS_PER_IP') + downloader.domain_concurrency = self.settings.get( + "CONCURRENT_REQUESTS_PER_DOMAIN" + ) + downloader.ip_concurrency = self.settings.get("CONCURRENT_REQUESTS_PER_IP") self.engine = dummy_class() self.engine.downloader = downloader - self.engine.downloader.total_concurrency = 
self.settings.getint('CONCURRENT_REQUESTS') + self.engine.downloader.total_concurrency = self.settings.getint( + "CONCURRENT_REQUESTS" + ) def set_slots(self, slotDict): slots = {} - for key, slotPair in six.iteritems(slotDict): + for key, slotPair in slotDict.items(): slots[key] = self.Slot(slotPair[0], slotPair[1]) self.engine.downloader.slots = slots diff --git a/tests/mocks/frontier_manager.py b/tests/mocks/frontier_manager.py index 0736ab353..ce89a5612 100644 --- a/tests/mocks/frontier_manager.py +++ b/tests/mocks/frontier_manager.py @@ -1,13 +1,10 @@ -from __future__ import absolute_import from frontera.settings import Settings -from six.moves import range -class FakeFrontierManager(object): - +class FakeFrontierManager: def __init__(self, settings): self.settings = settings - self.auto_start = settings.get('AUTO_START') + self.auto_start = settings.get("AUTO_START") self.iteration = 0 self.finished = False self._started = True @@ -40,13 +37,9 @@ def put_requests(self, requests): def get_next_requests(self, max_next_requests=0, **kwargs): self.get_next_requests_kwargs.append(kwargs) - max_next_requests = max_next_requests or self.settings.get('MAX_NEXT_REQUESTS') - lst = [] - for i in range(max_next_requests): - if self.requests: - lst.append(self.requests.pop()) + max_next_requests = max_next_requests or self.settings.get("MAX_NEXT_REQUESTS") self.iteration += 1 - return lst + return [self.requests.pop() for _i in range(max_next_requests) if self.requests] def page_crawled(self, response): self.responses.append(response) @@ -58,5 +51,3 @@ def links_extracted(self, request, links): def request_error(self, request, error): self.errors.append((request, error)) - - diff --git a/tests/mocks/load_objects.py b/tests/mocks/load_objects.py index b6a03e743..b574f3083 100644 --- a/tests/mocks/load_objects.py +++ b/tests/mocks/load_objects.py @@ -1,8 +1,7 @@ -mock_variable = 'test' +mock_variable = "test" -class MockClass(object): - +class MockClass: val = 10 def __init__(self, val): diff --git a/tests/mocks/message_bus.py b/tests/mocks/message_bus.py index f8b6f582b..11e41e060 100644 --- a/tests/mocks/message_bus.py +++ b/tests/mocks/message_bus.py @@ -1,14 +1,20 @@ -from frontera.core.messagebus import BaseMessageBus, BaseSpiderLogStream, BaseStreamConsumer, \ - BaseScoringLogStream, BaseSpiderFeedStream +from frontera.core.messagebus import ( + BaseMessageBus, + BaseScoringLogStream, + BaseSpiderFeedStream, + BaseSpiderLogStream, + BaseStreamConsumer, +) class Consumer(BaseStreamConsumer): - def __init__(self): self.messages = [] self.offset = None - def put_messages(self, messages=[]): + def put_messages(self, messages=None): + if messages is None: + messages = [] self.messages += messages def get_messages(self, timeout=0, count=1): @@ -27,8 +33,7 @@ def get_offset(self, partition_id): return self.offset -class Producer(object): - +class Producer: def __init__(self): self.messages = [] self.offset = 0 @@ -44,7 +49,6 @@ def get_offset(self, partition_id): class ScoringLogStream(BaseScoringLogStream): - def __init__(self, messagebus): pass @@ -56,7 +60,6 @@ def consumer(self): class SpiderLogStream(BaseSpiderLogStream): - def __init__(self, messagebus): pass @@ -68,7 +71,6 @@ def consumer(self, partition_id, type): class SpiderFeedStream(BaseSpiderFeedStream): - def __init__(self, messagebus): self.ready_partitions = set(messagebus.spider_feed_partitions) @@ -89,11 +91,12 @@ def mark_busy(self, partition_id): class FakeMessageBus(BaseMessageBus): - def __init__(self, settings): - 
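The put_messages() change above replaces a mutable default argument with a None sentinel; ruff's bugbear rule flags the former because the default list is created once at definition time and shared by every call that relies on it. A generic illustration, with made-up function names:

    def risky(item, bucket=[]):     # one shared list for every call
        bucket.append(item)
        return bucket

    def safe(item, bucket=None):    # the None-sentinel pattern used in the diff
        if bucket is None:
            bucket = []
        bucket.append(item)
        return bucket

    assert risky(1) == [1] and risky(2) == [1, 2]   # state leaks between calls
    assert safe(1) == [1] and safe(2) == [2]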
self.spider_log_partitions = [i for i in range(settings.get('SPIDER_LOG_PARTITIONS'))] - self.spider_feed_partitions = [i for i in range(settings.get('SPIDER_FEED_PARTITIONS'))] - self.max_next_requests = settings.get('MAX_NEXT_REQUESTS') + self.spider_log_partitions = list(range(settings.get("SPIDER_LOG_PARTITIONS"))) + self.spider_feed_partitions = list( + range(settings.get("SPIDER_FEED_PARTITIONS")) + ) + self.max_next_requests = settings.get("MAX_NEXT_REQUESTS") def spider_log(self): return SpiderLogStream(self) diff --git a/tests/scrapy_spider/frontera/settings.py b/tests/scrapy_spider/frontera/settings.py index fd2786d9c..814e8341a 100644 --- a/tests/scrapy_spider/frontera/settings.py +++ b/tests/scrapy_spider/frontera/settings.py @@ -1,13 +1,13 @@ -#-------------------------------------------------------- +# -------------------------------------------------------- # Frontier -#-------------------------------------------------------- -BACKEND = 'frontera.contrib.backends.memory.FIFO' +# -------------------------------------------------------- +BACKEND = "frontera.contrib.backends.memory.FIFO" MAX_REQUESTS = 5 MAX_NEXT_REQUESTS = 1 -#-------------------------------------------------------- +# -------------------------------------------------------- # Logging -#-------------------------------------------------------- +# -------------------------------------------------------- LOGGING_EVENTS_ENABLED = False LOGGING_MANAGER_ENABLED = False LOGGING_BACKEND_ENABLED = False diff --git a/tests/scrapy_spider/settings.py b/tests/scrapy_spider/settings.py index fa5c0326a..f76e38b2d 100644 --- a/tests/scrapy_spider/settings.py +++ b/tests/scrapy_spider/settings.py @@ -1,10 +1,10 @@ -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- # Scrapy Settings -#-------------------------------------------------------------------------- -BOT_NAME = 'scrapy_spider' +# -------------------------------------------------------------------------- +BOT_NAME = "scrapy_spider" -SPIDER_MODULES = ['tests.scrapy_spider.spiders'] -NEWSPIDER_MODULE = 'tests.scrapy_spider.spiders' +SPIDER_MODULES = ["tests.scrapy_spider.spiders"] +NEWSPIDER_MODULE = "tests.scrapy_spider.spiders" HTTPCACHE_ENABLED = False REDIRECT_ENABLED = True @@ -17,20 +17,20 @@ LOGSTATS_INTERVAL = 10 -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- # Frontier Settings -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- SPIDER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 999 + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware": 999 } DOWNLOADER_MIDDLEWARES = { - 'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 999 + "frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware": 999 } -SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler' -FRONTERA_SETTINGS = 'tests.scrapy_spider.frontera.settings' +SCHEDULER = "frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler" +FRONTERA_SETTINGS = "tests.scrapy_spider.frontera.settings" -#-------------------------------------------------------------------------- +# -------------------------------------------------------------------------- # Testing 
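These test settings show the full set of hooks needed to route a Scrapy project through Frontera; the same four pieces appear in any real project. A condensed sketch of a Scrapy settings module, where the FRONTERA_SETTINGS path is only an example:

    SPIDER_MIDDLEWARES = {
        "frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware": 999,
    }
    DOWNLOADER_MIDDLEWARES = {
        "frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware": 999,
    }
    SCHEDULER = "frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler"
    FRONTERA_SETTINGS = "myproject.frontera_settings"  # module defining BACKEND, MAX_NEXT_REQUESTS, ...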
-#-------------------------------------------------------------------------- -#CLOSESPIDER_PAGECOUNT = 1 +# -------------------------------------------------------------------------- +# CLOSESPIDER_PAGECOUNT = 1 diff --git a/tests/scrapy_spider/spiders/example.py b/tests/scrapy_spider/spiders/example.py index 000c7c3b8..4e1b40377 100644 --- a/tests/scrapy_spider/spiders/example.py +++ b/tests/scrapy_spider/spiders/example.py @@ -1,21 +1,10 @@ -from __future__ import absolute_import -from scrapy.linkextractors import LinkExtractor -from scrapy.spiders import CrawlSpider, Rule +from scrapy import Spider -class MySpider(CrawlSpider): - name = 'example' - start_urls = ['http://www.dmoz.org'] +class MySpider(Spider): + name = "example" + start_urls = ["data:,"] callback_calls = 0 - rules = [Rule(LinkExtractor(), - callback='parse_page', follow=True)] - - def parse_page(self, response): + def parse(self, response): self.callback_calls += 1 - pass - - def parse_nothing(self, response): - pass - - parse_start_url = parse_nothing diff --git a/tests/test_canonical_solver.py b/tests/test_canonical_solver.py index 41cc36252..796ca750c 100644 --- a/tests/test_canonical_solver.py +++ b/tests/test_canonical_solver.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import from frontera.contrib.canonicalsolvers import Basic, CorporateWebsiteFriendly from frontera.core.models import Request, Response from frontera.utils.fingerprint import sha1 @@ -8,9 +6,9 @@ def single_node_chain(url1, url2): r = Request(url=url1) re = Response(url=url2, request=r) - re.meta[b'fingerprint'] = sha1(url2) - re.meta[b'redirect_urls'] = [url1] - re.meta[b'redirect_fingerprints'] = [sha1(url1)] + re.meta[b"fingerprint"] = sha1(url2) + re.meta[b"redirect_urls"] = [url1] + re.meta[b"redirect_fingerprints"] = [sha1(url1)] return re @@ -38,4 +36,3 @@ def test_corporate_website_friendly(): re = single_node_chain("http://www.yandex.ru", "http://www.yandex.ru/search") cs.page_crawled(re) assert re.url == "http://www.yandex.ru/search" - diff --git a/tests/test_codecs.py b/tests/test_codecs.py index 82136f14b..64c7e7c4b 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -1,13 +1,27 @@ -# -*- coding: utf-8 -*- +import pytest + +pytest.importorskip("msgpack") -from __future__ import absolute_import import json import unittest -from frontera.contrib.backends.remote.codecs.json import (Encoder as JsonEncoder, Decoder as JsonDecoder, - _convert_and_save_type, _convert_from_saved_type) -from frontera.contrib.backends.remote.codecs.msgpack import Encoder as MsgPackEncoder, Decoder as MsgPackDecoder + +from frontera.contrib.backends.remote.codecs.json import ( + Decoder as JsonDecoder, +) +from frontera.contrib.backends.remote.codecs.json import ( + Encoder as JsonEncoder, +) +from frontera.contrib.backends.remote.codecs.json import ( + _convert_and_save_type, + _convert_from_saved_type, +) +from frontera.contrib.backends.remote.codecs.msgpack import ( + Decoder as MsgPackDecoder, +) +from frontera.contrib.backends.remote.codecs.msgpack import ( + Encoder as MsgPackEncoder, +) from frontera.core.models import Request, Response -import pytest def _compare_dicts(dict1, dict2): @@ -15,7 +29,7 @@ def _compare_dicts(dict1, dict2): Compares two dicts :return: True if both dicts are equal else False """ - if dict1 == None or dict2 == None: + if dict1 is None or dict2 is None: return False if type(dict1) is not dict or type(dict2) is not dict: @@ -23,15 +37,19 @@ def _compare_dicts(dict1, dict2): shared_keys = 
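pytest.importorskip is the pattern used throughout these modernized tests to keep optional-extra suites from failing when the extra is not installed: called at module level, it skips the whole file at collection time. For example:

    import pytest

    # Skips this module during collection if msgpack is missing,
    # instead of erroring on import.
    msgpack = pytest.importorskip("msgpack")

    def test_roundtrip():
        assert msgpack.unpackb(msgpack.packb(b"value")) == b"value"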
set(dict2.keys()) & set(dict2.keys()) - if not (len(shared_keys) == len(dict1.keys()) and len(shared_keys) == len(dict2.keys())): + if not ( + len(shared_keys) == len(dict1.keys()) and len(shared_keys) == len(dict2.keys()) + ): return False dicts_are_equal = True - for key in dict1.keys(): + for key in dict1: if type(dict1[key]) is dict: dicts_are_equal = _compare_dicts(dict1[key], dict2[key]) else: - dicts_are_equal = (dict1[key] == dict2[key]) and (type(dict1[key]) == type(dict2[key])) + dicts_are_equal = (dict1[key] == dict2[key]) and ( + type(dict1[key]) is type(dict2[key]) + ) if not dicts_are_equal: return False @@ -39,80 +57,96 @@ def _compare_dicts(dict1, dict2): return dicts_are_equal -@pytest.mark.parametrize('send_body', [True, False]) +@pytest.mark.parametrize("send_body", [True, False]) @pytest.mark.parametrize( - ('encoder', 'decoder'), [ - (MsgPackEncoder, MsgPackDecoder), - (JsonEncoder, JsonDecoder) - ] + ("encoder", "decoder"), + [(MsgPackEncoder, MsgPackDecoder), (JsonEncoder, JsonDecoder)], ) def test_codec(encoder, decoder, send_body): def check_request(req1, req2): - assert req1.url == req2.url and _compare_dicts(req1.meta, req2.meta) == True and \ - _compare_dicts(req1.headers, req2.headers) == True and req1.method == req2.method + assert ( + req1.url == req2.url + and _compare_dicts(req1.meta, req2.meta) is True + and _compare_dicts(req1.headers, req2.headers) is True + and req1.method == req2.method + ) enc = encoder(Request, send_body=send_body) dec = decoder(Request, Response) - req = Request(url="http://www.yandex.ru", method=b'GET', - meta={b'test': b'shmest', b'scrapy_meta': {'rule': 0, 'key': 'value'}}, headers={b'reqhdr': b'value'}) + req = Request( + url="http://www.yandex.ru", + method=b"GET", + meta={b"test": b"shmest", b"scrapy_meta": {"rule": 0, "key": "value"}}, + headers={b"reqhdr": b"value"}, + ) req2 = Request(url="http://www.yandex.ru/search") msgs = [ enc.encode_add_seeds([req]), - enc.encode_page_crawled(Response(url="http://www.yandex.ru", body=b'SOME CONTENT', headers={b'hdr': b'value'}, - request=req)), + enc.encode_page_crawled( + Response( + url="http://www.yandex.ru", + body=b"SOME CONTENT", + headers={b"hdr": b"value"}, + request=req, + ) + ), enc.encode_links_extracted(req, [req2]), enc.encode_request_error(req, "Host not found"), enc.encode_update_score(req, 0.51, True), enc.encode_new_job_id(1), enc.encode_offset(0, 28796), - enc.encode_request(req) + enc.encode_request(req), ] it = iter(msgs) o = dec.decode(next(it)) - assert o[0] == 'add_seeds' - assert type(o[1]) == list + assert o[0] == "add_seeds" + assert type(o[1]) is list req_d = o[1][0] check_request(req_d, req) - assert type(req_d) == Request + assert type(req_d) is Request o = dec.decode(next(it)) - assert o[0] == 'page_crawled' - assert type(o[1]) == Response + assert o[0] == "page_crawled" + assert type(o[1]) is Response assert o[1].url == req.url and o[1].meta == req.meta if send_body: - o[1].body == b'SOME CONTENT' + assert o[1].body == b"SOME CONTENT" else: - o[1].body is None + assert o[1].body is None o = dec.decode(next(it)) print(o) - assert o[0] == 'links_extracted' - assert type(o[1]) == Request + assert o[0] == "links_extracted" + assert type(o[1]) is Request assert o[1].url == req.url and o[1].meta == req.meta - assert type(o[2]) == list + assert type(o[2]) is list req_d = o[2][0] - assert type(req_d) == Request + assert type(req_d) is Request assert req_d.url == req2.url o_type, o_req, o_error = dec.decode(next(it)) - assert o_type == 'request_error' + assert 
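Outside the parametrized test, the codec round trip looks roughly like this sketch with the JSON codec; the URL, score and schedule flag are arbitrary:

    from frontera.contrib.backends.remote.codecs.json import Decoder as JsonDecoder
    from frontera.contrib.backends.remote.codecs.json import Encoder as JsonEncoder
    from frontera.core.models import Request, Response

    encoder = JsonEncoder(Request, send_body=True)
    decoder = JsonDecoder(Request, Response)
    request = Request("http://www.example.com")
    message = encoder.encode_update_score(request, 0.51, True)
    # decode() returns a tuple whose first element names the event type
    event, decoded, score, schedule = decoder.decode(message)
    assert event == "update_score" and decoded.url == request.url
    assert score == 0.51 and schedule is True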
o_type == "request_error" check_request(o_req, req) assert o_error == "Host not found" o_type, o_req2, score, schedule = dec.decode(next(it)) - assert o_type == 'update_score' - assert o_req2.url == req.url and o_req2.meta == req.meta and o_req2.headers == req.headers + assert o_type == "update_score" + assert ( + o_req2.url == req.url + and o_req2.meta == req.meta + and o_req2.headers == req.headers + ) assert score == 0.51 assert schedule is True o_type, job_id = dec.decode(next(it)) - assert o_type == 'new_job_id' + assert o_type == "new_job_id" assert job_id == 1 o_type, partition_id, offset = dec.decode(next(it)) - assert o_type == 'offset' + assert o_type == "offset" assert partition_id == 0 assert offset == 28796 @@ -127,21 +161,53 @@ class TestEncodeDecodeJson(unittest.TestCase): def test_encode_decode_json_recursively(self): _int = 1 - _bytes = b'bytes' - _unicode = u'unicode' + _bytes = b"bytes" + _unicode = "unicode" _bool = True _none = None - simple_dict = {'key': 'value'} - simple_list = ['item', 1] - simple_tuple = ('str', 2) - mixed_type_dict = {b'k1': 'v1', 'k2': b'v2', 'int': 1, b'none': None, 'bool': False} - mixed_type_list = [b'i1', 'i2', 23, None, True] - mixed_type_tuple = [b'i1', 'i2', 23, None, True] - nested_dict = {'k1': b'v1', 'lst': [b'i1', 1, ('str', 1, {'k2': b'v1', 'tup': (1, None)})]} - nested_list = [True, None, (1, 2, 3), {b'k1': b'v1', 'tup': ('a', b'b', [None, False])}] - nested_tuple = (1, None, ['a', 'b', True, {b'k1': 'v2', 'lst': ['a', False, (2, 3, 5)]}]) - msgs = [_int, _bytes, _unicode, _bool, _none, simple_dict, simple_list, simple_tuple, - mixed_type_dict, mixed_type_list, mixed_type_tuple, nested_dict, nested_list, nested_tuple] + simple_dict = {"key": "value"} + simple_list = ["item", 1] + simple_tuple = ("str", 2) + mixed_type_dict = { + b"k1": "v1", + "k2": b"v2", + "int": 1, + b"none": None, + "bool": False, + } + mixed_type_list = [b"i1", "i2", 23, None, True] + mixed_type_tuple = [b"i1", "i2", 23, None, True] + nested_dict = { + "k1": b"v1", + "lst": [b"i1", 1, ("str", 1, {"k2": b"v1", "tup": (1, None)})], + } + nested_list = [ + True, + None, + (1, 2, 3), + {b"k1": b"v1", "tup": ("a", b"b", [None, False])}, + ] + nested_tuple = ( + 1, + None, + ["a", "b", True, {b"k1": "v2", "lst": ["a", False, (2, 3, 5)]}], + ) + msgs = [ + _int, + _bytes, + _unicode, + _bool, + _none, + simple_dict, + simple_list, + simple_tuple, + mixed_type_dict, + mixed_type_list, + mixed_type_tuple, + nested_dict, + nested_list, + nested_tuple, + ] encoder = json.JSONEncoder() decoder = json.JSONDecoder() for original_msg in msgs: diff --git a/tests/test_core_overused_buffer.py b/tests/test_core_overused_buffer.py index f08e32933..2d069f747 100644 --- a/tests/test_core_overused_buffer.py +++ b/tests/test_core_overused_buffer.py @@ -1,28 +1,20 @@ -from __future__ import absolute_import from frontera.core import OverusedBuffer from frontera.core.models import Request -from six.moves import range +r1 = Request("http://www.example.com") +r2 = Request("http://www.example.com/some/") +r3 = Request("htttp://www.example.com/some/page/") +r4 = Request("http://example.com") +r5 = Request("http://example.com/some/page") +r6 = Request("http://example1.com") -r1 = Request('http://www.example.com') -r2 = Request('http://www.example.com/some/') -r3 = Request('htttp://www.example.com/some/page/') -r4 = Request('http://example.com') -r5 = Request('http://example.com/some/page') -r6 = Request('http://example1.com') - - -class TestOverusedBuffer(object): +class TestOverusedBuffer: 
requests = [] logs = [] def get_func(self, max_n_requests, **kwargs): - lst = [] - for _ in range(max_n_requests): - if self.requests: - lst.append(self.requests.pop()) - return lst + return [self.requests.pop() for _ in range(max_n_requests) if self.requests] def log_func(self, msg): self.logs.append(msg) @@ -30,30 +22,38 @@ def log_func(self, msg): def test(self): ob = OverusedBuffer(self.get_func, self.log_func) self.requests = [r1, r2, r3, r4, r5, r6] - assert set(ob.get_next_requests(10, overused_keys=['www.example.com', 'example1.com'], - key_type='domain')) == set([r4, r5]) - assert set(self.logs) == set(["Overused keys: ['www.example.com', 'example1.com']", - "Pending: 0"]) + assert set( + ob.get_next_requests( + 10, overused_keys=["www.example.com", "example1.com"], key_type="domain" + ) + ) == {r4, r5} + assert set(self.logs) == { + "Overused keys: ['www.example.com', 'example1.com']", + "Pending: 0", + } self.logs = [] - assert ob.get_next_requests(10, overused_keys=['www.example.com'], - key_type='domain') == [r6] - assert set(self.logs) == set(["Overused keys: ['www.example.com']", - "Pending: 4"]) + assert ob.get_next_requests( + 10, overused_keys=["www.example.com"], key_type="domain" + ) == [r6] + assert set(self.logs) == {"Overused keys: ['www.example.com']", "Pending: 4"} self.logs = [] - assert ob.get_next_requests(10, overused_keys=['www.example.com'], - key_type='domain') == [] - assert set(self.logs) == set(["Overused keys: ['www.example.com']", - "Pending: 3"]) + assert ( + ob.get_next_requests( + 10, overused_keys=["www.example.com"], key_type="domain" + ) + == [] + ) + assert set(self.logs) == {"Overused keys: ['www.example.com']", "Pending: 3"} self.logs = [] - #the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case. - assert set(ob.get_next_requests(3, overused_keys=['example.com'], - key_type='domain')) == set([r1, r2, r3]) - assert set(self.logs) == set(["Overused keys: ['example.com']", - "Pending: 3"]) + # the max_next_requests is 3 here to cover the "len(requests) == max_next_requests" case. 
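The same buffer can be exercised directly; a minimal sketch in which the URLs are arbitrary and the pop-based getter mirrors the test helper above:

    from frontera.core import OverusedBuffer
    from frontera.core.models import Request

    pending = [Request("http://quiet.example/a"), Request("http://busy.example/b")]

    def get_requests(max_n_requests, **kwargs):
        return [pending.pop() for _ in range(max_n_requests) if pending]

    buffer = OverusedBuffer(get_requests, print)
    # Requests whose netloc is listed as overused are held back for later calls.
    ready = buffer.get_next_requests(10, overused_keys=["busy.example"], key_type="domain")
    assert [r.url for r in ready] == ["http://quiet.example/a"]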
+ assert set( + ob.get_next_requests(3, overused_keys=["example.com"], key_type="domain") + ) == {r1, r2, r3} + assert set(self.logs) == {"Overused keys: ['example.com']", "Pending: 3"} self.logs = [] - assert ob.get_next_requests(10, overused_keys=[], key_type='domain') == [] - assert set(self.logs) == set(["Overused keys: []", "Pending: 0"]) + assert ob.get_next_requests(10, overused_keys=[], key_type="domain") == [] + assert set(self.logs) == {"Overused keys: []", "Pending: 0"} diff --git a/tests/test_domain_mware.py b/tests/test_domain_mware.py index ecf06169f..b1dc78920 100644 --- a/tests/test_domain_mware.py +++ b/tests/test_domain_mware.py @@ -1,11 +1,14 @@ -from __future__ import absolute_import +import pytest + +pytest.importorskip("tldextract") + import unittest + from frontera.contrib.middlewares.domain import DomainMiddleware -from frontera.core.manager import FrontierManager from frontera.core.models import Request -class FakeManager(object): +class FakeManager: settings = {} test_mode = False @@ -19,46 +22,70 @@ def test_create(self): def test_should_parse_domain_info(self): seeds = [ - Request('http://example.com'), - Request('https://www.google.com'), + Request("http://example.com"), + Request("https://www.google.com"), ] mware = DomainMiddleware(self.fake_manager) result = mware.add_seeds(seeds) - self.assertEquals(len(result), len(seeds)) + self.assertEqual(len(result), len(seeds)) for r in result: - self.assertIn(b'domain', r.meta, 'Missing domain info for %r' % r) + self.assertIn(b"domain", r.meta, f"Missing domain info for {r!r}") expected = [ - {b'name': b'example.com', b'netloc': b'example.com', b'scheme': b'http', - b'sld': b'', b'subdomain': b'', b'tld': b''}, - {b'name': b'www.google.com', b'netloc': b'www.google.com', b'scheme': b'https', - b'sld': b'', b'subdomain': b'', b'tld': b''}, + { + b"name": b"example.com", + b"netloc": b"example.com", + b"scheme": b"http", + b"sld": b"", + b"subdomain": b"", + b"tld": b"", + }, + { + b"name": b"www.google.com", + b"netloc": b"www.google.com", + b"scheme": b"https", + b"sld": b"", + b"subdomain": b"", + b"tld": b"", + }, ] - self.assertEquals(expected, [r.meta[b'domain'] for r in result]) + self.assertEqual(expected, [r.meta[b"domain"] for r in result]) def test_should_parse_tldextract_extra_domain_info(self): seeds = [ - Request('http://example.com'), - Request('https://www.google.com'), + Request("http://example.com"), + Request("https://www.google.com"), ] - self.fake_manager.settings = {'TLDEXTRACT_DOMAIN_INFO': True} + self.fake_manager.settings = {"TLDEXTRACT_DOMAIN_INFO": True} mware = DomainMiddleware(self.fake_manager) result = mware.add_seeds(seeds) - self.assertEquals(len(result), len(seeds)) + self.assertEqual(len(result), len(seeds)) for r in result: - self.assertIn(b'domain', r.meta, 'Missing domain info for %r' % r) + self.assertIn(b"domain", r.meta, f"Missing domain info for {r!r}") expected = [ - {b'name': b'example.com', b'netloc': b'example.com', b'scheme': b'http', - b'sld': b'example', b'subdomain': b'', b'tld': b'com'}, - {b'name': b'google.com', b'netloc': b'www.google.com', b'scheme': b'https', - b'sld': b'google', b'subdomain': b'www', b'tld': b'com'}, + { + b"name": b"example.com", + b"netloc": b"example.com", + b"scheme": b"http", + b"sld": b"example", + b"subdomain": b"", + b"tld": b"com", + }, + { + b"name": b"google.com", + b"netloc": b"www.google.com", + b"scheme": b"https", + b"sld": b"google", + b"subdomain": b"www", + b"tld": b"com", + }, ] - self.assertEquals(expected, 
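DomainMiddleware fills request.meta[b'domain'] with the expected values shown above, and with TLDEXTRACT_DOMAIN_INFO enabled (tested just below) it also splits subdomain, sld and tld. A condensed sketch of the same behaviour, assuming tldextract is installed as this test module requires; the manager stub mirrors the FakeManager above:

    from frontera.contrib.middlewares.domain import DomainMiddleware
    from frontera.core.models import Request

    class StubManager:
        settings = {}
        test_mode = False

    seeds = DomainMiddleware(StubManager()).add_seeds([Request("https://www.google.com")])
    assert seeds[0].meta[b"domain"][b"netloc"] == b"www.google.com"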
[r.meta[b'domain'] for r in result]) + self.assertEqual(expected, [r.meta[b"domain"] for r in result]) diff --git a/tests/test_filters.py b/tests/test_filters.py index 796faecdb..ff4e6b527 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -1,12 +1,12 @@ import unittest -from frontera.logger.filters import PLAINVALUES, INCLUDEFIELDS, EXCLUDEFIELDS +from frontera.logger.filters import EXCLUDEFIELDS, INCLUDEFIELDS, PLAINVALUES from tests.utils import LoggingCaptureMixin, SetupDefaultLoggingMixin class BaseTestFilters(SetupDefaultLoggingMixin, LoggingCaptureMixin, unittest.TestCase): def tearDown(self): - super(BaseTestFilters, self).setUp() + super().setUp() self.logger.handlers[0].filters = [] def addFilter(self, filter): @@ -15,108 +15,114 @@ def addFilter(self, filter): class TestFilterPlainValues(BaseTestFilters): def test_plain_values_exclude_fields(self): - filter = PLAINVALUES(excluded_fields=['event']) + filter = PLAINVALUES(excluded_fields=["event"]) self.addFilter(filter) - self.logger.debug({'message1': 'logging', 'message2': 'debug', 'event': 'value'}) + self.logger.debug( + {"message1": "logging", "message2": "debug", "event": "value"} + ) log_msg = self.logger_output.getvalue() - assert log_msg == 'logging debug\n' or log_msg == 'debug logging\n' + assert log_msg in ("logging debug\n", "debug logging\n") def test_plain_values_separator(self): - filter = PLAINVALUES(separator=',') + filter = PLAINVALUES(separator=",") self.addFilter(filter) - self.logger.debug({'message1': 'logging', 'message2': 'debug'}) + self.logger.debug({"message1": "logging", "message2": "debug"}) log_msg = self.logger_output.getvalue() - assert log_msg == 'logging,debug\n' or log_msg == 'debug,logging\n' + assert log_msg in ("logging,debug\n", "debug,logging\n") def test_plain_values_msg_max_length(self): filter = PLAINVALUES(msg_max_length=10) self.addFilter(filter) - self.logger.debug({'message1': '1' * 10, 'message2': '2' * 10}) + self.logger.debug({"message1": "1" * 10, "message2": "2" * 10}) log_msg = self.logger_output.getvalue() - assert log_msg == '%s...\n' % ('1' * 7) or log_msg == '%s...\n' % ('2' * 7) + assert log_msg in (f"{'1' * 7}...\n", f"{'2' * 7}...\n") def test_plain_values_str_msg(self): filter = PLAINVALUES(msg_max_length=10) self.addFilter(filter) - self.logger.debug('debug message') - self.assertEqual(self.logger_output.getvalue(), 'debug message\n') + self.logger.debug("debug message") + self.assertEqual(self.logger_output.getvalue(), "debug message\n") class TestIncludeFields(BaseTestFilters): def test_include_fields_matching_values(self): - filter = INCLUDEFIELDS(field_name='event', included_values=['page_crawled']) + filter = INCLUDEFIELDS(field_name="event", included_values=["page_crawled"]) self.addFilter(filter) - self.logger.debug('crawled page P', extra={'event': 'page_crawled'}) - self.assertEqual(self.logger_output.getvalue(), 'crawled page P\n') + self.logger.debug("crawled page P", extra={"event": "page_crawled"}) + self.assertEqual(self.logger_output.getvalue(), "crawled page P\n") def test_include_fields_non_matching_values(self): - filter = INCLUDEFIELDS(field_name='event', included_values=['links_extracted']) + filter = INCLUDEFIELDS(field_name="event", included_values=["links_extracted"]) self.addFilter(filter) - self.logger.debug('crawled page P', extra={'event': 'page_crawled'}) - self.assertEqual(self.logger_output.getvalue(), '') + self.logger.debug("crawled page P", extra={"event": "page_crawled"}) + 
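PLAINVALUES renders dict log records as a plain string of their values, dropping the excluded fields. Attached to a handler it works roughly like this; the logger name and fields are arbitrary:

    import logging

    from frontera.logger.filters import PLAINVALUES

    logger = logging.getLogger("example")
    handler = logging.StreamHandler()
    handler.addFilter(PLAINVALUES(excluded_fields=["event"]))
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.debug({"message1": "crawled", "event": "page_crawled"})  # emits just "crawled"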
self.assertEqual(self.logger_output.getvalue(), "") def test_include_fields_dict_msg_matching_values(self): - filter = INCLUDEFIELDS(field_name='event', included_values=['page_crawled']) + filter = INCLUDEFIELDS(field_name="event", included_values=["page_crawled"]) self.addFilter(filter) - self.logger.debug({'message': 'debug message', 'event': 'page_crawled'}) + self.logger.debug({"message": "debug message", "event": "page_crawled"}) log_msg = self.logger_output.getvalue() - assert log_msg == "{'event': 'page_crawled', 'message': 'debug message'}\n" or \ - log_msg == "{'message': 'debug message', 'event': 'page_crawled'}\n" + assert log_msg in ( + "{'event': 'page_crawled', 'message': 'debug message'}\n", + "{'message': 'debug message', 'event': 'page_crawled'}\n", + ) def test_include_fields_dict_msg_non_matching_values(self): - filter = INCLUDEFIELDS(field_name='event', included_values=['links_extracted']) + filter = INCLUDEFIELDS(field_name="event", included_values=["links_extracted"]) self.addFilter(filter) - self.logger.debug({'message': 'debug message', 'event': 'page_crawled'}) - self.assertEqual(self.logger_output.getvalue(), '') + self.logger.debug({"message": "debug message", "event": "page_crawled"}) + self.assertEqual(self.logger_output.getvalue(), "") def test_include_fields_field_name_none(self): filter = INCLUDEFIELDS(field_name=None, included_values=[]) self.addFilter(filter) - self.logger.debug('debug message') - self.assertEqual(self.logger_output.getvalue(), 'debug message\n') + self.logger.debug("debug message") + self.assertEqual(self.logger_output.getvalue(), "debug message\n") def test_include_fields_list_message(self): - filter = INCLUDEFIELDS(field_name='event', included_values=['page_crawled']) + filter = INCLUDEFIELDS(field_name="event", included_values=["page_crawled"]) self.addFilter(filter) - self.logger.debug(['debug message']) + self.logger.debug(["debug message"]) self.assertEqual(self.logger_output.getvalue(), "['debug message']\n") class TestExcludeFields(BaseTestFilters): def test_exclude_fields_matching_values(self): - filter = EXCLUDEFIELDS(field_name='event', excluded_fields=['page_crawled']) + filter = EXCLUDEFIELDS(field_name="event", excluded_fields=["page_crawled"]) self.addFilter(filter) - self.logger.debug('crawled page P', extra={'event': 'page_crawled'}) - self.assertEqual(self.logger_output.getvalue(), '') + self.logger.debug("crawled page P", extra={"event": "page_crawled"}) + self.assertEqual(self.logger_output.getvalue(), "") def test_exclude_fields_non_matching_values(self): - filter = EXCLUDEFIELDS(field_name='event', excluded_fields=['links_extracted']) + filter = EXCLUDEFIELDS(field_name="event", excluded_fields=["links_extracted"]) self.addFilter(filter) - self.logger.debug('crawled page P', extra={'event': 'page_crawled'}) - self.assertEqual(self.logger_output.getvalue(), 'crawled page P\n') + self.logger.debug("crawled page P", extra={"event": "page_crawled"}) + self.assertEqual(self.logger_output.getvalue(), "crawled page P\n") def test_exclude_fields_dict_msg_matching_values(self): - filter = EXCLUDEFIELDS(field_name='event', excluded_fields='page_crawled') + filter = EXCLUDEFIELDS(field_name="event", excluded_fields="page_crawled") self.addFilter(filter) - self.logger.debug({'message': 'debug message', 'event': 'page_crawled'}) - self.assertEqual(self.logger_output.getvalue(), '') + self.logger.debug({"message": "debug message", "event": "page_crawled"}) + self.assertEqual(self.logger_output.getvalue(), "") def 
test_exclude_fields_dict_msg_non_matching_values(self): - filter = EXCLUDEFIELDS(field_name='event', excluded_fields='links_extracted') + filter = EXCLUDEFIELDS(field_name="event", excluded_fields="links_extracted") self.addFilter(filter) - self.logger.debug({'message': 'debug message', 'event': 'page_crawled'}) + self.logger.debug({"message": "debug message", "event": "page_crawled"}) log_msg = self.logger_output.getvalue() - assert log_msg == "{'event': 'page_crawled', 'message': 'debug message'}\n" or \ - log_msg == "{'message': 'debug message', 'event': 'page_crawled'}\n" + assert log_msg in ( + "{'event': 'page_crawled', 'message': 'debug message'}\n", + "{'message': 'debug message', 'event': 'page_crawled'}\n", + ) def test_include_fields_field_name_none(self): filter = EXCLUDEFIELDS(field_name=None, excluded_fields=[]) self.addFilter(filter) - self.logger.debug('debug message') - self.assertEqual(self.logger_output.getvalue(), 'debug message\n') + self.logger.debug("debug message") + self.assertEqual(self.logger_output.getvalue(), "debug message\n") def test_include_fields_list_message(self): - filter = EXCLUDEFIELDS(field_name='event', excluded_fields=['page_crawled']) + filter = EXCLUDEFIELDS(field_name="event", excluded_fields=["page_crawled"]) self.addFilter(filter) - self.logger.debug(['debug message']) + self.logger.debug(["debug message"]) self.assertEqual(self.logger_output.getvalue(), "['debug message']\n") diff --git a/tests/test_fingerprint.py b/tests/test_fingerprint.py index f4b4ca33b..aecaa83e7 100644 --- a/tests/test_fingerprint.py +++ b/tests/test_fingerprint.py @@ -1,42 +1,57 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.utils.fingerprint import hostname_local_fingerprint, sha1, md5 from w3lib.util import to_bytes +from frontera.utils.fingerprint import hostname_local_fingerprint, md5, sha1 -url1 = u"https://news.yandex.ru/yandsearch?cl4url=top.rbc.ru/politics/14/07/2015/55a50b509a79473f583e104c&lang=ru&lr=54#fragment" -url2 = u"TestString" -url3 = u"http://www.example.com/some/page\u5000/" +url1 = "https://news.yandex.ru/yandsearch?cl4url=top.rbc.ru/politics/14/07/2015/55a50b509a79473f583e104c&lang=ru&lr=54#fragment" +url2 = "TestString" +url3 = "http://www.example.com/some/page\u5000/" -class TestFingerprint(object): - +class TestFingerprint: def test_sha1_bytes(self): - assert sha1(to_bytes(url1)) == b'880c5e7919cb09e182bd639d724bce6d90db71eb' - assert sha1(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert sha1(to_bytes(url3)) == b'28bf812b6421a46ee5bcf40c05a82e8f051ab88e' + assert sha1(to_bytes(url1)) == b"880c5e7919cb09e182bd639d724bce6d90db71eb" + assert sha1(to_bytes(url2)) == b"d598b03bee8866ae03b54cb6912efdfef107fd6d" + assert sha1(to_bytes(url3)) == b"28bf812b6421a46ee5bcf40c05a82e8f051ab88e" def test_sha1_unicode(self): - assert sha1(url1) == b'880c5e7919cb09e182bd639d724bce6d90db71eb' - assert sha1(url2) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert sha1(url3) == b'28bf812b6421a46ee5bcf40c05a82e8f051ab88e' + assert sha1(url1) == b"880c5e7919cb09e182bd639d724bce6d90db71eb" + assert sha1(url2) == b"d598b03bee8866ae03b54cb6912efdfef107fd6d" + assert sha1(url3) == b"28bf812b6421a46ee5bcf40c05a82e8f051ab88e" def test_md5_bytes(self): - assert md5(to_bytes(url1)) == b'bb82110ce034c1a6ad55a2e73adc322a' - assert md5(to_bytes(url2)) == b'5b56f40f8828701f97fa4511ddcd25fb' - assert md5(to_bytes(url3)) == b'5abf5c9aa02d870756032bdec0bd6522' + assert md5(to_bytes(url1)) == 
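For reference, the three fingerprint helpers accept either str or bytes URLs and return hex digests as bytes; hostname_local_fingerprint is meant to key its digest on the hostname so URLs from one host sort near each other in key-ordered backends. A quick sanity check with an arbitrary URL:

    from frontera.utils.fingerprint import hostname_local_fingerprint, md5, sha1

    url = "https://news.yandex.ru/yandsearch?lang=ru"
    assert len(sha1(url)) == 40                         # SHA-1 hex digest, as bytes
    assert len(md5(url)) == 32                          # MD5 hex digest, as bytes
    assert len(hostname_local_fingerprint(url)) == 40   # hostname-keyed 20-byte digest, hex-encoded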
b"bb82110ce034c1a6ad55a2e73adc322a" + assert md5(to_bytes(url2)) == b"5b56f40f8828701f97fa4511ddcd25fb" + assert md5(to_bytes(url3)) == b"5abf5c9aa02d870756032bdec0bd6522" def test_md5_unicode(self): - assert md5(url1) == b'bb82110ce034c1a6ad55a2e73adc322a' - assert md5(url2) == b'5b56f40f8828701f97fa4511ddcd25fb' - assert md5(url3) == b'5abf5c9aa02d870756032bdec0bd6522' + assert md5(url1) == b"bb82110ce034c1a6ad55a2e73adc322a" + assert md5(url2) == b"5b56f40f8828701f97fa4511ddcd25fb" + assert md5(url3) == b"5abf5c9aa02d870756032bdec0bd6522" def test_local_hostname_fingerprint_bytes(self): - assert hostname_local_fingerprint(to_bytes(url1)) == b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' - assert hostname_local_fingerprint(to_bytes(url2)) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert hostname_local_fingerprint(to_bytes(url3)) == b'2ed642bbdf514b8520ab28f5da589ab28eda10a6' + assert ( + hostname_local_fingerprint(to_bytes(url1)) + == b"1be68ff556fd0bbe5802d1a100850da29f7f15b1" + ) + assert ( + hostname_local_fingerprint(to_bytes(url2)) + == b"d598b03bee8866ae03b54cb6912efdfef107fd6d" + ) + assert ( + hostname_local_fingerprint(to_bytes(url3)) + == b"2ed642bbdf514b8520ab28f5da589ab28eda10a6" + ) def test_local_hostname_frongerprint_unicode(self): - assert hostname_local_fingerprint(url1) == b'1be68ff556fd0bbe5802d1a100850da29f7f15b1' - assert hostname_local_fingerprint(url2) == b'd598b03bee8866ae03b54cb6912efdfef107fd6d' - assert hostname_local_fingerprint(url3) == b'2ed642bbdf514b8520ab28f5da589ab28eda10a6' + assert ( + hostname_local_fingerprint(url1) + == b"1be68ff556fd0bbe5802d1a100850da29f7f15b1" + ) + assert ( + hostname_local_fingerprint(url2) + == b"d598b03bee8866ae03b54cb6912efdfef107fd6d" + ) + assert ( + hostname_local_fingerprint(url3) + == b"2ed642bbdf514b8520ab28f5da589ab28eda10a6" + ) diff --git a/tests/test_formatters.py b/tests/test_formatters.py index 065fab2fd..7a3e19851 100644 --- a/tests/test_formatters.py +++ b/tests/test_formatters.py @@ -1,22 +1,27 @@ -import unittest -import re -import json +import pytest + +pytest.importorskip("colorlog") + import datetime +import json +import re +import unittest +from frontera.logger.formatters import CONSOLE from frontera.logger.formatters.color import ColorFormatter from frontera.logger.formatters.json import JSONFormatter -from frontera.logger.formatters import CONSOLE - from tests.utils import LoggingCaptureMixin, SetupDefaultLoggingMixin, colors -class BaseTestFormatters(SetupDefaultLoggingMixin, LoggingCaptureMixin, unittest.TestCase): +class BaseTestFormatters( + SetupDefaultLoggingMixin, LoggingCaptureMixin, unittest.TestCase +): def setUp(self): - super(BaseTestFormatters, self).setUp() + super().setUp() self.default_formatter = self.logger.handlers[0].formatter def tearDown(self): - super(BaseTestFormatters, self).setUp() + super().setUp() self.logger.handlers[0].formatter = self.default_formatter def setFormatter(self, formatter): @@ -24,7 +29,6 @@ def setFormatter(self, formatter): class TestFormatterColor(BaseTestFormatters): - def test_formatter_color(self): c = ColorFormatter( format="%(log_color)s [%(name)s] %(message)s", @@ -33,18 +37,23 @@ def test_formatter_color(self): "INFO": "green", "ERROR": "red", }, - log_color_field="levelname") + log_color_field="levelname", + ) self.setFormatter(c) - self.logger.debug('debug message') - self.logger.info('info message') - self.logger.error('error message') - self.assertEqual(self.logger_output.getvalue(), - '{white} [frontera] debug message{reset}\n' - '{green} 
[frontera] info message{reset}\n' - '{red} [frontera] error message{reset}\n'.format(white=colors['white'], - green=colors['green'], - red=colors['red'], - reset=colors['reset'])) + self.logger.debug("debug message") + self.logger.info("info message") + self.logger.error("error message") + self.assertEqual( + self.logger_output.getvalue(), + "{white} [frontera] debug message{reset}\n" + "{green} [frontera] info message{reset}\n" + "{red} [frontera] error message{reset}\n".format( + white=colors["white"], + green=colors["green"], + red=colors["red"], + reset=colors["reset"], + ), + ) def test_formatter_color_datefmt(self): c = ColorFormatter( @@ -54,63 +63,71 @@ def test_formatter_color_datefmt(self): "INFO": "green", "ERROR": "red", }, - datefmt='%d-%m-%Y %H:%M:%S', - log_color_field="levelname") + datefmt="%d-%m-%Y %H:%M:%S", + log_color_field="levelname", + ) self.setFormatter(c) - self.logger.debug('debug message') - self.assertRegexpMatches(self.logger_output.getvalue(), - '{white} \d{{2}}-\d{{2}}-\d{{4}} \d{{2}}:\d{{2}}:\d{{2}} ' - '\\[frontera\\] debug message{reset}\n'.format( - white=re.escape(colors['white']), - reset=re.escape(colors['reset']))) + self.logger.debug("debug message") + self.assertRegex( + self.logger_output.getvalue(), + r"{white} \d{{2}}-\d{{2}}-\d{{4}} \d{{2}}:\d{{2}}:\d{{2}} " + "\\[frontera\\] debug message{reset}\n".format( + white=re.escape(colors["white"]), reset=re.escape(colors["reset"]) + ), + ) class TestFormatterJson(BaseTestFormatters): - def setUp(self): - super(TestFormatterJson, self).setUp() + super().setUp() self.setFormatter(JSONFormatter()) def test_formatter_json_log_text(self): - self.logger.debug('debug message') - self.assertEqual(json.loads(self.logger_output.getvalue())['message'], 'debug message') + self.logger.debug("debug message") + self.assertEqual( + json.loads(self.logger_output.getvalue())["message"], "debug message" + ) def test_formatter_json_log_dict(self): dct_msg = { - 'message': 'debug message', - 'extra': 'value', + "message": "debug message", + "extra": "value", } self.logger.debug(dct_msg) json_log = json.loads(self.logger_output.getvalue()) - self.assertEqual(json_log.get('message'), 'debug message') - self.assertEqual(json_log.get('extra'), 'value') + self.assertEqual(json_log.get("message"), "debug message") + self.assertEqual(json_log.get("extra"), "value") def test_formatter_json_log_datetime_objects(self): dct_msg = { - 'message': 'debug message', - 'datetime': datetime.datetime(2016, 9, 19, 23, 59), - 'date': datetime.date(2016, 9, 20), - 'timedelta': datetime.datetime(2016, 9, 19, 23, 59) - datetime.datetime(2016, 9, 19, 23, 50), + "message": "debug message", + "datetime": datetime.datetime(2016, 9, 19, 23, 59), + "date": datetime.date(2016, 9, 20), + "timedelta": datetime.datetime(2016, 9, 19, 23, 59) + - datetime.datetime(2016, 9, 19, 23, 50), } self.logger.debug(dct_msg) json_log = json.loads(self.logger_output.getvalue()) - self.assertEqual(json_log.get('message'), 'debug message') - self.assertEqual(json_log.get('datetime'), '2016-09-19T23:59:00') - self.assertEqual(json_log.get('date'), '2016-09-20') - self.assertEqual(json_log.get('timedelta'), '00:09:00') + self.assertEqual(json_log.get("message"), "debug message") + self.assertEqual(json_log.get("datetime"), "2016-09-19T23:59:00") + self.assertEqual(json_log.get("date"), "2016-09-20") + self.assertEqual(json_log.get("timedelta"), "00:09:00") class TestFormatterMiscellaneous(BaseTestFormatters): - def test_formatter_console(self): self.setFormatter(CONSOLE) 
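JSONFormatter serializes dict log records, including the datetime, date and timedelta values checked above, into one JSON object per line. Wiring it up looks roughly like this, assuming the optional logging dependencies are installed:

    import logging

    from frontera.logger.formatters.json import JSONFormatter

    logger = logging.getLogger("example.json")
    handler = logging.StreamHandler()
    handler.setFormatter(JSONFormatter())
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.debug({"message": "page crawled", "event": "page_crawled"})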
- self.logger.debug('debug message') - self.logger.info('info message') - self.logger.error('error message') - self.assertEqual(self.logger_output.getvalue(), - '{white}[frontera] debug message{reset}\n' - '{green}[frontera] info message{reset}\n' - '{red}[frontera] error message{reset}\n'.format(white=colors['white'], - green=colors['green'], - red=colors['red'], - reset=colors['reset'])) + self.logger.debug("debug message") + self.logger.info("info message") + self.logger.error("error message") + self.assertEqual( + self.logger_output.getvalue(), + "{white}[frontera] debug message{reset}\n" + "{green}[frontera] info message{reset}\n" + "{red}[frontera] error message{reset}\n".format( + white=colors["white"], + green=colors["green"], + red=colors["red"], + reset=colors["reset"], + ), + ) diff --git a/tests/test_frontera_scheduler.py b/tests/test_frontera_scheduler.py index fe1b7a50a..a879ee4d5 100644 --- a/tests/test_frontera_scheduler.py +++ b/tests/test_frontera_scheduler.py @@ -1,35 +1,36 @@ -from __future__ import absolute_import -from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler -from tests.mocks.frontier_manager import FakeFrontierManager -from tests.mocks.crawler import FakeCrawler -from frontera.core.models import Request as FRequest -from frontera.core.models import Response as FResponse +import pytest + +pytest.importorskip("scrapy") + from scrapy.http import Request, Response -from scrapy.spiders import Spider from scrapy.settings import Settings -from six.moves import range +from scrapy.spiders import Spider +from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler +from frontera.core.models import Request as FRequest +from frontera.core.models import Response as FResponse +from tests.mocks.crawler import FakeCrawler +from tests.mocks.frontier_manager import FakeFrontierManager # test requests -r1 = Request('http://www.example.com') -r2 = Request('https://www.example.com/some/page') -r3 = Request('http://example1.com') +r1 = Request("http://www.example.com") +r2 = Request("https://www.example.com/some/page") +r3 = Request("http://example1.com") # test requests with redirects -rr1 = Request('http://www.example.com', meta={b'redirect_times': 1}) -rr2 = Request('https://www.example.com/some/page', meta={b'redirect_times': 4}) -rr3 = Request('http://example1.com', meta={b'redirect_times': 0}) +rr1 = Request("http://www.example.com", meta={b"redirect_times": 1}) +rr2 = Request("https://www.example.com/some/page", meta={b"redirect_times": 4}) +rr3 = Request("http://example1.com", meta={b"redirect_times": 0}) # test frontier requests -fr1 = FRequest('http://www.example.com') -fr2 = Request('https://www.example.com/some/page') -fr3 = Request('http://example1.com') - +fr1 = FRequest("http://www.example.com") +fr2 = Request("https://www.example.com/some/page") +fr3 = Request("http://example1.com") -class TestFronteraScheduler(object): +class TestFronteraScheduler: def test_enqueue_requests(self): crawler = FakeCrawler() fs = FronteraScheduler(crawler, manager=FakeFrontierManager) @@ -37,13 +38,17 @@ def test_enqueue_requests(self): assert fs.enqueue_request(r1) is True assert fs.enqueue_request(r2) is True assert fs.enqueue_request(r3) is True - assert set(seed.url for seed in fs.frontier.manager.seeds) == set([r1.url, r2.url, r3.url]) - assert all([isinstance(seed, FRequest) for seed in fs.frontier.manager.seeds]) - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 3 + assert {seed.url for seed in fs.frontier.manager.seeds} == { 
+ r1.url, + r2.url, + r3.url, + } + assert all(isinstance(seed, FRequest) for seed in fs.frontier.manager.seeds) + assert fs.stats_manager.stats.get_value("frontera/seeds_count") == 3 def test_redirect_disabled_enqueue_requests(self): settings = Settings() - settings['REDIRECT_ENABLED'] = False + settings["REDIRECT_ENABLED"] = False crawler = FakeCrawler(settings) fs = FronteraScheduler(crawler, manager=FakeFrontierManager) fs.open(Spider) @@ -53,11 +58,11 @@ def test_redirect_disabled_enqueue_requests(self): assert isinstance(fs.frontier.manager.seeds[0], FRequest) assert len(fs.frontier.manager.seeds) == 1 assert fs.frontier.manager.seeds[0].url == rr3.url - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1 + assert fs.stats_manager.stats.get_value("frontera/seeds_count") == 1 def test_redirect_enabled_enqueue_requests(self): settings = Settings() - settings['REDIRECT_ENABLED'] = True + settings["REDIRECT_ENABLED"] = True crawler = FakeCrawler(settings) fs = FronteraScheduler(crawler, manager=FakeFrontierManager) fs.open(Spider) @@ -67,10 +72,12 @@ def test_redirect_enabled_enqueue_requests(self): assert len(fs.frontier.manager.seeds) == 1 assert isinstance(fs.frontier.manager.seeds[0], FRequest) assert fs.frontier.manager.seeds[0].url == rr3.url - assert set([request.url for request in fs._pending_requests]) == set([rr1.url, rr2.url]) - assert all([isinstance(request, Request) for request in fs._pending_requests]) - assert fs.stats_manager.stats.get_value('frontera/seeds_count') == 1 - assert fs.stats_manager.stats.get_value('frontera/redirected_requests_count') == 2 + assert {request.url for request in fs._pending_requests} == {rr1.url, rr2.url} + assert all(isinstance(request, Request) for request in fs._pending_requests) + assert fs.stats_manager.stats.get_value("frontera/seeds_count") == 1 + assert ( + fs.stats_manager.stats.get_value("frontera/redirected_requests_count") == 2 + ) def test_next_request(self): crawler = FakeCrawler() @@ -78,9 +85,9 @@ def test_next_request(self): fs.open(Spider) fs.frontier.manager.put_requests([fr1, fr2, fr3]) requests = [fs.next_request() for _ in range(3)] - assert set([request.url for request in requests]) == set([fr1.url, fr2.url, fr3.url]) - assert all([isinstance(request, Request) for request in requests]) - assert fs.stats_manager.stats.get_value('frontera/returned_requests_count') == 3 + assert {request.url for request in requests} == {fr1.url, fr2.url, fr3.url} + assert all(isinstance(request, Request) for request in requests) + assert fs.stats_manager.stats.get_value("frontera/returned_requests_count") == 3 def test_next_request_manager_finished(self): crawler = FakeCrawler() @@ -89,16 +96,23 @@ def test_next_request_manager_finished(self): fs.frontier.manager.put_requests([fr1]) fs.frontier.manager.finished = True assert fs.next_request() is None - assert fs.stats_manager.stats.get_value('frontera/returned_requests_count') is None + assert ( + fs.stats_manager.stats.get_value("frontera/returned_requests_count") is None + ) def test_next_request_overused_keys_info(self): settings = Settings() - settings['CONCURRENT_REQUESTS_PER_DOMAIN'] = 0 - settings['CONCURRENT_REQUESTS_PER_IP'] = 5 + settings["CONCURRENT_REQUESTS_PER_DOMAIN"] = 0 + settings["CONCURRENT_REQUESTS_PER_IP"] = 5 crawler = FakeCrawler(settings) # the keys in the slot_dict are ip's, the first value in the pair is the # slot.active list(only it's length is needed) and the second value is slot.concurrency. 
- slot_dict = {'1.2.3': ([0]*3, 1), '2.1.3': ([0]*30, 2), '3.2.2': ([0]*5, 1), '4.1.3': ([0]*110, 20)} + slot_dict = { + "1.2.3": ([0] * 3, 1), + "2.1.3": ([0] * 30, 2), + "3.2.2": ([0] * 5, 1), + "4.1.3": ([0] * 110, 20), + } crawler.set_slots(slot_dict) fs = FronteraScheduler(crawler, manager=FakeFrontierManager) fs.open(Spider) @@ -106,39 +120,54 @@ def test_next_request_overused_keys_info(self): request = fs.next_request() assert request.url == fr1.url assert isinstance(request, Request) - assert fs.frontier.manager.get_next_requests_kwargs[0]['key_type'] == 'ip' - assert set(fs.frontier.manager.get_next_requests_kwargs[0]['overused_keys']) == set(['2.1.3', '4.1.3']) - assert fs.stats_manager.stats.get_value('frontera/returned_requests_count') == 1 + assert fs.frontier.manager.get_next_requests_kwargs[0]["key_type"] == "ip" + assert set( + fs.frontier.manager.get_next_requests_kwargs[0]["overused_keys"] + ) == {"2.1.3", "4.1.3"} + assert fs.stats_manager.stats.get_value("frontera/returned_requests_count") == 1 def test_process_spider_output(self): - i1 = {'name': 'item', 'item': 'i1'} - i2 = {'name': 'item', 'item': 'i2'} + i1 = {"name": "item", "item": "i1"} + i2 = {"name": "item", "item": "i2"} result = [r1, r2, r3, i1, i2] - resp = Response(fr1.url, request=Request(fr1.url, meta={b'frontier_request': fr1})) + resp = Response( + fr1.url, request=Request(fr1.url, meta={b"frontier_request": fr1}) + ) crawler = FakeCrawler() fs = FronteraScheduler(crawler, manager=FakeFrontierManager) fs.open(Spider) - assert sorted(list(fs.process_spider_output(resp, result, Spider)), key=lambda i: sorted(i['item'])) == \ - sorted([i1, i2], key=lambda i: sorted(i['item'])) + assert sorted( + fs.process_spider_output(resp, result, Spider), + key=lambda i: sorted(i["item"]), + ) == sorted([i1, i2], key=lambda i: sorted(i["item"])) assert isinstance(fs.frontier.manager.responses[0], FResponse) assert fs.frontier.manager.responses[0].url == resp.url - assert set([request.url for request in fs.frontier.manager.links]) == set([r1.url, r2.url, r3.url]) - assert all([isinstance(request, FRequest) for request in fs.frontier.manager.links]) - assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1 - assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1 - assert fs.stats_manager.stats.get_value('frontera/links_extracted_count') == 3 + assert {request.url for request in fs.frontier.manager.links} == { + r1.url, + r2.url, + r3.url, + } + assert all( + isinstance(request, FRequest) for request in fs.frontier.manager.links + ) + assert fs.stats_manager.stats.get_value("frontera/crawled_pages_count") == 1 + assert fs.stats_manager.stats.get_value("frontera/crawled_pages_count/200") == 1 + assert fs.stats_manager.stats.get_value("frontera/links_extracted_count") == 3 def test_process_exception(self): - exception = type('exception', (object,), {}) + exception = type("exception", (object,), {}) crawler = FakeCrawler() fs = FronteraScheduler(crawler, manager=FakeFrontierManager) fs.open(Spider) fs.process_exception(r1, exception(), Spider) error = fs.frontier.manager.errors.pop() assert error[0].url == r1.url - assert error[1] == 'exception' - assert fs.stats_manager.stats.get_value('frontera/request_errors_count') == 1 - assert fs.stats_manager.stats.get_value('frontera/request_errors_count/exception') == 1 + assert error[1] == "exception" + assert fs.stats_manager.stats.get_value("frontera/request_errors_count") == 1 + assert ( + 
fs.stats_manager.stats.get_value("frontera/request_errors_count/exception") + == 1 + ) def test_close(self): crawler = FakeCrawler() @@ -147,7 +176,7 @@ def test_close(self): fs.frontier.manager.put_requests([fr1, fr2, fr3]) fs.next_request() fs.frontier.manager.iteration = 5 - fs.close('reason') + fs.close("reason") assert fs.frontier.manager._stopped is True - assert fs.stats_manager.stats.get_value('frontera/pending_requests_count') == 2 - assert fs.stats_manager.stats.get_value('frontera/iterations') == 5 + assert fs.stats_manager.stats.get_value("frontera/pending_requests_count") == 2 + assert fs.stats_manager.stats.get_value("frontera/iterations") == 5 diff --git a/tests/test_frontier_manager.py b/tests/test_frontier_manager.py index 60d57970e..8ee1f41f2 100644 --- a/tests/test_frontier_manager.py +++ b/tests/test_frontier_manager.py @@ -1,32 +1,30 @@ -from __future__ import absolute_import from frontera.core.manager import FrontierManager -from frontera.settings import Settings from frontera.core.models import Request, Response -from six.moves import range - - -r1 = Request('http://www.example.com') -r2 = Request('https://www.example.com/some/page') -r3 = Request('http://example1.com') +from frontera.settings import Settings +r1 = Request("http://www.example.com") +r2 = Request("https://www.example.com/some/page") +r3 = Request("http://example1.com") -class TestFrontierManager(object): +class TestFrontierManager: def setup_frontier_manager(self, settings=None): settings = settings or Settings() - settings.BACKEND = 'tests.mocks.components.FakeBackend' - settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware', - 'tests.mocks.components.FakeMiddlewareModifySeeds', - 'tests.mocks.components.FakeMiddlewareModifyResponse', - 'tests.mocks.components.FakeMiddlewareModifyLinks'] - settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' + settings.BACKEND = "tests.mocks.components.FakeBackend" + settings.MIDDLEWARES = [ + "tests.mocks.components.FakeMiddleware", + "tests.mocks.components.FakeMiddlewareModifySeeds", + "tests.mocks.components.FakeMiddlewareModifyResponse", + "tests.mocks.components.FakeMiddlewareModifyLinks", + ] + settings.CANONICAL_SOLVER = "tests.mocks.components.FakeCanonicalSolver" return FrontierManager.from_settings(settings) def test_start(self): fm = self.setup_frontier_manager() assert fm._started is True assert fm.backend._started is True - assert [mw._started for mw in fm.middlewares] == [True]*4 + assert [mw._started for mw in fm.middlewares] == [True] * 4 assert fm.canonicalsolver._started is True def test_stop(self): @@ -34,14 +32,14 @@ def test_stop(self): fm.stop() assert fm._stopped is True assert fm.backend._stopped is True - assert [mw._stopped for mw in fm.middlewares] == [True]*4 + assert [mw._stopped for mw in fm.middlewares] == [True] * 4 assert fm.canonicalsolver._stopped is True def test_properties(self): fm = self.setup_frontier_manager() - assert fm.test_mode == fm.settings.get('TEST_MODE') - assert fm.max_next_requests == fm.settings.get('MAX_NEXT_REQUESTS') - assert fm.auto_start == fm.settings.get('AUTO_START') + assert fm.test_mode == fm.settings.get("TEST_MODE") + assert fm.max_next_requests == fm.settings.get("MAX_NEXT_REQUESTS") + assert fm.auto_start == fm.settings.get("AUTO_START") assert fm.iteration == 0 assert fm.n_requests == 0 assert fm.finished is False @@ -50,48 +48,52 @@ def test_add_seeds(self): fm = self.setup_frontier_manager() fm.add_seeds([r1, r2, r3]) - #seeds reached backend. 
- assert set([seed for seed in fm.backend.seeds]) == set([r1, r2, r3]) - #seeds reached canonicalsolver - assert set([seed for seed in fm.canonicalsolver.seeds]) == set([r1, r2, r3]) - #seeds reached the 4 middlewares. - assert [set([seed for seed in mw.seeds]) for mw in fm.middlewares] == [set([r1, r2, r3])]*4 - #seeds were modified. - assert [seed.meta[b'test_seeds'] for seed in [r1, r2, r3]] == ['test']*3 - assert [seed.meta[b'test_seeds_canonical_solver'] for seed in [r1, r2, r3]] == ['test']*3 + # seeds reached backend. + assert set(fm.backend.seeds) == {r1, r2, r3} + # seeds reached canonicalsolver + assert set(fm.canonicalsolver.seeds) == {r1, r2, r3} + # seeds reached the 4 middlewares. + assert [set(mw.seeds) for mw in fm.middlewares] == [{r1, r2, r3}] * 4 + # seeds were modified. + assert [seed.meta[b"test_seeds"] for seed in [r1, r2, r3]] == ["test"] * 3 + assert [seed.meta[b"test_seeds_canonical_solver"] for seed in [r1, r2, r3]] == [ + "test" + ] * 3 def test_page_crawled(self): fm = self.setup_frontier_manager() response = Response(r1.url, request=r1) fm.page_crawled(response) assert fm.backend.responses.pop() == response - assert [mw.responses.pop() for mw in fm.middlewares] == [response]*4 + assert [mw.responses.pop() for mw in fm.middlewares] == [response] * 4 assert fm.canonicalsolver.responses.pop() == response - assert response.meta[b'test_response'] == 'test' + assert response.meta[b"test_response"] == "test" def test_links_extracted(self): fm = self.setup_frontier_manager() - response = Response(r1.url, request=r1) + Response(r1.url, request=r1) fm.links_extracted(r1, links=[r2, r3]) - assert set([link for link in fm.backend.links]) == set([r2, r3]) - assert set([link for link in fm.canonicalsolver.links]) == set([r2, r3]) - assert [set([link for link in mw.links]) for mw in fm.middlewares] == [set([r2, r3])]*4 - assert [link.meta[b'test_links'] for link in [r2, r3]] == ['test']*2 - assert [link.meta[b'test_links_canonical_solver'] for link in [r2, r3]] == ['test']*2 + assert set(fm.backend.links) == {r2, r3} + assert set(fm.canonicalsolver.links) == {r2, r3} + assert [set(mw.links) for mw in fm.middlewares] == [{r2, r3}] * 4 + assert [link.meta[b"test_links"] for link in [r2, r3]] == ["test"] * 2 + assert [link.meta[b"test_links_canonical_solver"] for link in [r2, r3]] == [ + "test" + ] * 2 def test_get_next_requests(self): fm = self.setup_frontier_manager() fm.backend.put_requests([r1, r2, r3]) - assert set(fm.get_next_requests(3)) == set([r1, r2, r3]) + assert set(fm.get_next_requests(3)) == {r1, r2, r3} assert fm.iteration == 1 assert fm.n_requests == 3 def test_request_error(self): fm = self.setup_frontier_manager() - fm.request_error(r1, 'error') - assert fm.backend.errors.pop() == (r1, 'error') - assert [mw.errors.pop() for mw in fm.middlewares] == [(r1, 'error')]*4 - assert fm.canonicalsolver.errors.pop() == (r1, 'error') + fm.request_error(r1, "error") + assert fm.backend.errors.pop() == (r1, "error") + assert [mw.errors.pop() for mw in fm.middlewares] == [(r1, "error")] * 4 + assert fm.canonicalsolver.errors.pop() == (r1, "error") def test_max_requests_reached(self): settings = Settings() @@ -99,32 +101,38 @@ def test_max_requests_reached(self): fm = self.setup_frontier_manager(settings) fm.backend.put_requests([r1, r2, r3]) requests = set(fm.get_next_requests(10)) - assert requests == set([r1, r2]) or requests == set([r2, r3]) or requests == set([r1, r3]) + assert requests in ({r1, r2}, {r2, r3}, {r1, r3}) assert fm.get_next_requests(10) == [] assert 
fm.finished is True def test_blocking_middleware(self): settings = Settings() - settings.BACKEND = 'tests.mocks.components.FakeBackend' - settings.MIDDLEWARES = ['tests.mocks.components.FakeMiddleware', - 'tests.mocks.components.FakeMiddlewareModifySeeds', - 'tests.mocks.components.FakeMiddlewareBlocking', - 'tests.mocks.components.FakeMiddlewareModifyResponse', - 'tests.mocks.components.FakeMiddlewareModifyLinks'] - settings.CANONICAL_SOLVER = 'tests.mocks.components.FakeCanonicalSolver' + settings.BACKEND = "tests.mocks.components.FakeBackend" + settings.MIDDLEWARES = [ + "tests.mocks.components.FakeMiddleware", + "tests.mocks.components.FakeMiddlewareModifySeeds", + "tests.mocks.components.FakeMiddlewareBlocking", + "tests.mocks.components.FakeMiddlewareModifyResponse", + "tests.mocks.components.FakeMiddlewareModifyLinks", + ] + settings.CANONICAL_SOLVER = "tests.mocks.components.FakeCanonicalSolver" fm = FrontierManager.from_settings(settings) fm.add_seeds([r1, r2, r3]) response = Response(r1.url, request=r1) fm.page_crawled(response) fm.links_extracted(r1, links=[r2]) - fm.request_error(r3, 'error') - - #the seeds, responses, links and errors have not reached the backend. - assert [len(list) for list in fm.backend.lists] == [0]*4 - #the 3 seeds reach the first three middlewares. - assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3]*3 - #the error, response and link reached the first three middlewares. - assert [[len(list) for list in fm.middlewares[i].lists[1:]] for i in range(3)] == [[1]*3]*3 - #the values do not reach the bottom 2 middlewares and the canonical solver. - assert [[len(list) for list in fm.middlewares[i].lists] for i in range(3, 5)] == [[0]*4]*2 - assert [len(list) for list in fm.canonicalsolver.lists] == [0]*4 + fm.request_error(r3, "error") + + # the seeds, responses, links and errors have not reached the backend. + assert [len(list) for list in fm.backend.lists] == [0] * 4 + # the 3 seeds reach the first three middlewares. + assert [len(fm.middlewares[i].seeds) for i in range(3)] == [3] * 3 + # the error, response and link reached the first three middlewares. + assert [ + [len(list) for list in fm.middlewares[i].lists[1:]] for i in range(3) + ] == [[1] * 3] * 3 + # the values do not reach the bottom 2 middlewares and the canonical solver. 
+ assert [ + [len(list) for list in fm.middlewares[i].lists] for i in range(3, 5) + ] == [[0] * 4] * 2 + assert [len(list) for list in fm.canonicalsolver.lists] == [0] * 4 diff --git a/tests/test_handlers.py b/tests/test_handlers.py index deda7ba9d..2342188a0 100644 --- a/tests/test_handlers.py +++ b/tests/test_handlers.py @@ -1,30 +1,38 @@ -import unittest +import pytest + +pytest.importorskip("colorlog") + import logging import logging.config +import unittest from frontera.logger.handlers import CONSOLE -from tests.utils import SetupDefaultLoggingMixin, LoggingCaptureMixin, colors +from tests.utils import LoggingCaptureMixin, SetupDefaultLoggingMixin, colors class SetupHandler(SetupDefaultLoggingMixin): @classmethod def setUpClass(cls): - super(SetupHandler, cls).setUpClass() - l = logging.getLogger('frontera') - l.handlers[0] = cls.handler + super().setUpClass() + logger = logging.getLogger("frontera") + logger.handlers[0] = cls.handler class TestHandlerConsole(SetupHandler, LoggingCaptureMixin, unittest.TestCase): handler = CONSOLE def test_handler_color_based_on_level(self): - self.logger.debug('debug message') - self.logger.info('info message') - self.logger.error('error message') - self.assertEqual(self.logger_output.getvalue(), - '{white}[frontera] debug message{reset}\n' - '{green}[frontera] info message{reset}\n' - '{red}[frontera] error message{reset}\n'.format(white=colors['white'], - green=colors['green'], - red=colors['red'], - reset=colors['reset'])) + self.logger.debug("debug message") + self.logger.info("info message") + self.logger.error("error message") + self.assertEqual( + self.logger_output.getvalue(), + "{white}[frontera] debug message{reset}\n" + "{green}[frontera] info message{reset}\n" + "{red}[frontera] error message{reset}\n".format( + white=colors["white"], + green=colors["green"], + red=colors["red"], + reset=colors["reset"], + ), + ) diff --git a/tests/test_message_bus.py b/tests/test_message_bus.py deleted file mode 100644 index b283a7405..000000000 --- a/tests/test_message_bus.py +++ /dev/null @@ -1,238 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from frontera.settings import Settings -from frontera.contrib.messagebus.zeromq import MessageBus as ZeroMQMessageBus -from frontera.contrib.messagebus.kafkabus import MessageBus as KafkaMessageBus, Consumer as KafkaConsumer -from frontera.utils.fingerprint import sha1 -from kafka import KafkaClient -from random import randint -from time import sleep -from six.moves import range -import logging -from sys import stdout -import unittest -from w3lib.util import to_bytes - - -class MessageBusTester(object): - def __init__(self, cls, settings=Settings()): - settings.set('SPIDER_FEED_PARTITIONS', 1) - settings.set('SPIDER_LOG_PARTITIONS', 1) - settings.set('QUEUE_HOSTNAME_PARTITIONING', True) - self.messagebus = cls(settings) - spiderlog = self.messagebus.spider_log() - - # sw - self.sw_sl_c = spiderlog.consumer(partition_id=0, type=b'sw') - - scoring_log = self.messagebus.scoring_log() - self.sw_us_p = scoring_log.producer() - - sleep(0.1) - - # db - self.db_sl_c = spiderlog.consumer(partition_id=None, type=b'db') - self.db_us_c = scoring_log.consumer() - - spider_feed = self.messagebus.spider_feed() - self.db_sf_p = spider_feed.producer() - - sleep(0.1) - - # spider - self.sp_sl_p = spiderlog.producer() - self.sp_sf_c = spider_feed.consumer(0) - - sleep(0.1) - - def spider_log_activity(self, messages): - for i in range(0, messages): - if i % 2 == 0: - self.sp_sl_p.send(sha1(str(randint(1, 
1000))), b'http://helloworld.com/way/to/the/sun/' + b'0') - else: - self.sp_sl_p.send(sha1(str(randint(1, 1000))), b'http://way.to.the.sun' + b'0') - self.sp_sl_p.flush() - - def spider_feed_activity(self): - sf_c = 0 - for m in self.sp_sf_c.get_messages(timeout=1.0, count=512): - sf_c += 1 - return sf_c - - def sw_activity(self): - c = 0 - p = 0 - for m in self.sw_sl_c.get_messages(timeout=1.0, count=512): - if m.startswith(b'http://helloworld.com/'): - p += 1 - self.sw_us_p.send(None, b'message' + b'0' + b"," + to_bytes(str(c))) - c += 1 - assert p > 0 - return c - - def db_activity(self, messages): - - sl_c = 0 - us_c = 0 - - for m in self.db_sl_c.get_messages(timeout=1.0, count=512): - sl_c += 1 - for m in self.db_us_c.get_messages(timeout=1.0, count=512): - us_c += 1 - for i in range(0, messages): - if i % 2 == 0: - self.db_sf_p.send(b"newhost", b"http://newhost/new/url/to/crawl") - else: - self.db_sf_p.send(b"someotherhost", b"http://newhost223/new/url/to/crawl") - self.db_sf_p.flush() - return (sl_c, us_c) - - -class KafkaConsumerPolling(object): - """ - This is needed to adapt for Kafka client zero-result attempts to consume messages from topic. There are reasons - why this could happen: offset out of range or assignment/subscription problems. - """ - def __init__(self, consumer): - self._consumer = consumer - self._buffer = [] - result = self._consumer.get_messages() - self._buffer.extend(result) - - def get_messages(self, timeout=0.1, count=1): - result = [] - tries = 2 - while tries and len(result) < count: - if self._buffer: - result.extend(self._buffer[:count]) - self._buffer = self._buffer[count:] - else: - result.extend(self._consumer.get_messages(timeout=timeout, count=count)) - tries -= 1 - return result - - def close(self): - self._consumer.close() - - - -class KafkaMessageBusTest(unittest.TestCase): - def setUp(self): - logging.basicConfig() - handler = logging.StreamHandler(stdout) - logger = logging.getLogger("kafka") - logger.setLevel(logging.DEBUG) - logger.addHandler(handler) - - kafka_location = "127.0.0.1:9092" - client = KafkaClient(kafka_location) - client.ensure_topic_exists("frontier-todo") - client.ensure_topic_exists("frontier-done") - client.ensure_topic_exists("frontier-score") - client.close() - - settings = Settings() - settings.set('KAFKA_LOCATION', kafka_location) - settings.set('SPIDER_FEED_PARTITIONS', 1) - settings.set('SPIDER_LOG_PARTITIONS', 1) - settings.set('QUEUE_HOSTNAME_PARTITIONING', True) - self.messagebus = KafkaMessageBus(settings) - spiderlog = self.messagebus.spider_log() - - # sw - self.sw_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=0, type=b'sw')) - - scoring_log = self.messagebus.scoring_log() - self.sw_us_p = scoring_log.producer() - - # db - self.db_sl_c = KafkaConsumerPolling(spiderlog.consumer(partition_id=None, type=b'db')) - self.db_us_c = KafkaConsumerPolling(scoring_log.consumer()) - - spider_feed = self.messagebus.spider_feed() - self.db_sf_p = spider_feed.producer() - - # spider - self.sp_sl_p = spiderlog.producer() - self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0)) - - def tearDown(self): - self.sw_us_p.close() - self.db_sf_p.close() - self.sp_sl_p.close() - - self.sw_sl_c.close() - self.db_sl_c.close() - self.db_us_c.close() - self.sp_sf_c.close() - - def spider_log_activity(self, messages): - for i in range(0, messages): - if i % 2 == 0: - self.sp_sl_p.send(sha1(str(randint(1, 1000))), b'http://helloworld.com/way/to/the/sun/' + b'0') - else: - self.sp_sl_p.send(sha1(str(randint(1, 
1000))), b'http://way.to.the.sun' + b'0') - self.sp_sl_p.flush() - - def spider_feed_activity(self): - sf_c = 0 - for m in self.sp_sf_c.get_messages(timeout=0.1, count=512): - sf_c += 1 - return sf_c - - def sw_activity(self): - c = 0 - p = 0 - for m in self.sw_sl_c.get_messages(timeout=0.1, count=512): - if m.startswith(b'http://helloworld.com/'): - p += 1 - self.sw_us_p.send(None, b'message' + b'0' + b"," + to_bytes(str(c))) - c += 1 - assert p > 0 - return c - - def db_activity(self, messages): - - sl_c = 0 - us_c = 0 - - for m in self.db_sl_c.get_messages(timeout=0.1, count=512): - sl_c += 1 - for m in self.db_us_c.get_messages(timeout=0.1, count=512): - us_c += 1 - for i in range(0, messages): - if i % 2 == 0: - self.db_sf_p.send(b"newhost", b"http://newhost/new/url/to/crawl") - else: - self.db_sf_p.send(b"someotherhost", b"http://newhost223/new/url/to/crawl") - self.db_sf_p.flush() - return (sl_c, us_c) - - def test_integration(self): - self.spider_log_activity(64) - assert self.sw_activity() == 64 - assert self.db_activity(128) == (64, 32) - assert self.spider_feed_activity() == 128 - - -class IPv6MessageBusTester(MessageBusTester): - """ - Same as MessageBusTester but with ipv6-localhost - """ - # TODO This class should be used for IPv6 testing. Use the broker on port - # 5570 for this test. - def __init__(self): - settings = Settings() - settings.set('ZMQ_ADDRESS', '::1') - super(IPv6MessageBusTester, self).__init__(settings) - - -def test_zmq_message_bus(): - """ - Test MessageBus with default settings, IPv6 and Star as ZMQ_ADDRESS - """ - tester = MessageBusTester(ZeroMQMessageBus) - tester.spider_log_activity(64) - assert tester.sw_activity() == 64 - assert tester.db_activity(128) == (64, 32) - assert tester.spider_feed_activity() == 128 diff --git a/tests/test_message_bus_backend.py b/tests/test_message_bus_backend.py index 68278d133..0ae59cc9c 100644 --- a/tests/test_message_bus_backend.py +++ b/tests/test_message_bus_backend.py @@ -1,29 +1,32 @@ -from __future__ import absolute_import +import pytest + +pytest.importorskip("msgpack") + import unittest from frontera.contrib.backends.remote.messagebus import MessageBusBackend -from frontera.settings import Settings from frontera.core.models import Request, Response +from frontera.settings import Settings - -r1 = Request('http://www.example.com/', meta={b'domain': {b'fingerprint': b'1'}}) -r2 = Request('http://www.scrapy.org/', meta={b'domain': {b'fingerprint': b'2'}}) -r3 = Request('http://www.test.com/some/page', meta={b'domain': {b'fingerprint': b'3'}}) +r1 = Request("http://www.example.com/", meta={b"domain": {b"fingerprint": b"1"}}) +r2 = Request("http://www.scrapy.org/", meta={b"domain": {b"fingerprint": b"2"}}) +r3 = Request("http://www.test.com/some/page", meta={b"domain": {b"fingerprint": b"3"}}) class TestMessageBusBackend(unittest.TestCase): - def mbb_setup(self, settings=None): - manager = type('manager', (object,), {}) + manager = type("manager", (object,), {}) settings = settings or Settings() - settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.MESSAGE_BUS = "tests.mocks.message_bus.FakeMessageBus" settings.STORE_CONTENT = True manager.settings = settings manager.request_model = Request manager.response_model = Response return MessageBusBackend(manager) - def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zero(self): + def test_feed_partitions_less_than_equal_partion_id_and_partion_id_less_than_zero( + self, + ): settings = Settings() # test partition_id > 
feed_partitions settings.SPIDER_PARTITION_ID = 2 @@ -42,41 +45,49 @@ def test_add_seeds(self): mbb = self.mbb_setup() mbb.add_seeds([r1, r2, r3]) seeds = [mbb._decoder.decode(m)[1][0] for m in mbb.spider_log_producer.messages] - self.assertEqual(set([seed.url for seed in seeds]), set([r1.url, r2.url, r3.url])) + self.assertEqual({seed.url for seed in seeds}, {r1.url, r2.url, r3.url}) def test_page_crawled(self): mbb = self.mbb_setup() - resp = Response(r1.url, body='body', request=r1) + resp = Response(r1.url, body="body", request=r1) mbb.page_crawled(resp) page = mbb._decoder.decode(mbb.spider_log_producer.messages[0])[1] - self.assertEqual((page.request.url, page.body), (resp.request.url, 'body')) + self.assertEqual((page.request.url, page.body), (resp.request.url, "body")) def test_links_extracted(self): mbb = self.mbb_setup() mbb.links_extracted(r1, [r2, r3]) requests = [mbb._decoder.decode(m)[1] for m in mbb.spider_log_producer.messages] links = [mbb._decoder.decode(m)[2][0] for m in mbb.spider_log_producer.messages] - self.assertEqual(set([r.url for r in requests]), set([r1.url])) - self.assertEqual(set([link.url for link in links]), set([r2.url, r3.url])) + self.assertEqual({r.url for r in requests}, {r1.url}) + self.assertEqual({link.url for link in links}, {r2.url, r3.url}) def test_request_error(self): mbb = self.mbb_setup() - mbb.request_error(r1, 'error') - _, error_request, error_message = mbb._decoder.decode(mbb.spider_log_producer.messages[0]) - self.assertEqual((error_request.url, error_message), (r1.url, 'error')) + mbb.request_error(r1, "error") + _, error_request, error_message = mbb._decoder.decode( + mbb.spider_log_producer.messages[0] + ) + self.assertEqual((error_request.url, error_message), (r1.url, "error")) def test_get_next_requests(self): mbb = self.mbb_setup() encoded_requests = [mbb._encoder.encode_request(r) for r in [r1, r2, r3]] mbb.consumer.put_messages(encoded_requests) mbb.consumer._set_offset(0) - requests = set(mbb.get_next_requests(10, overused_keys=[], key_type='domain')) - _, partition_id, offset = mbb._decoder.decode(mbb.spider_log_producer.messages[0]) + requests = set(mbb.get_next_requests(10, overused_keys=[], key_type="domain")) + _, partition_id, offset = mbb._decoder.decode( + mbb.spider_log_producer.messages[0] + ) self.assertEqual((partition_id, offset), (0, 0)) - self.assertEqual(set([r.url for r in requests]), set([r1.url, r2.url, r3.url])) - requests = set(mbb.get_next_requests(10, overused_keys=[], key_type='domain')) + self.assertEqual({r.url for r in requests}, {r1.url, r2.url, r3.url}) + requests = set(mbb.get_next_requests(10, overused_keys=[], key_type="domain")) self.assertEqual([r.url for r in requests], []) # test overused keys mbb.consumer.put_messages(encoded_requests) - requests = set(mbb.get_next_requests(10, overused_keys=['www.example.com'], key_type='domain')) - self.assertEqual(set([r.url for r in requests]), set([r2.url, r3.url])) + requests = set( + mbb.get_next_requests( + 10, overused_keys=["www.example.com"], key_type="domain" + ) + ) + self.assertEqual({r.url for r in requests}, {r2.url, r3.url}) diff --git a/tests/test_message_bus_kafka.py b/tests/test_message_bus_kafka.py new file mode 100644 index 000000000..c8bbdf17a --- /dev/null +++ b/tests/test_message_bus_kafka.py @@ -0,0 +1,161 @@ +import pytest + +pytest.importorskip("kafka") + +import logging +import unittest +from random import randint +from sys import stdout + +from kafka import KafkaClient +from kafka.errors import KafkaUnavailableError, 
NoBrokersAvailable +from w3lib.util import to_bytes + +from frontera.contrib.messagebus.kafkabus import MessageBus as KafkaMessageBus +from frontera.settings import Settings +from frontera.utils.fingerprint import sha1 + + +class KafkaConsumerPolling: + """ + This is needed to adapt for Kafka client zero-result attempts to consume messages from topic. There are reasons + why this could happen: offset out of range or assignment/subscription problems. + """ + + def __init__(self, consumer): + self._consumer = consumer + self._buffer = [] + result = self._consumer.get_messages() + self._buffer.extend(result) + + def get_messages(self, timeout=0.1, count=1): + result = [] + tries = 2 + while tries and len(result) < count: + if self._buffer: + result.extend(self._buffer[:count]) + self._buffer = self._buffer[count:] + else: + result.extend(self._consumer.get_messages(timeout=timeout, count=count)) + tries -= 1 + return result + + def close(self): + self._consumer.close() + + +class KafkaMessageBusTest(unittest.TestCase): + def setUp(self): + logging.basicConfig() + handler = logging.StreamHandler(stdout) + logger = logging.getLogger("kafka") + logger.setLevel(logging.DEBUG) + logger.addHandler(handler) + + kafka_location = "127.0.0.1:9092" + try: + client = KafkaClient(bootstrap_servers=kafka_location) + except TypeError: # old kafka-python + try: + client = KafkaClient(kafka_location) + except KafkaUnavailableError: + raise self.skipTest("No running kafka service") + except NoBrokersAvailable: + raise self.skipTest("No running kafka service") + client.ensure_topic_exists("frontier-todo") + client.ensure_topic_exists("frontier-done") + client.ensure_topic_exists("frontier-score") + client.close() + + settings = Settings() + settings.set("KAFKA_LOCATION", kafka_location) + settings.set("SPIDER_FEED_PARTITIONS", 1) + settings.set("SPIDER_LOG_PARTITIONS", 1) + settings.set("QUEUE_HOSTNAME_PARTITIONING", True) + self.messagebus = KafkaMessageBus(settings) + spiderlog = self.messagebus.spider_log() + + # sw + self.sw_sl_c = KafkaConsumerPolling( + spiderlog.consumer(partition_id=0, type=b"sw") + ) + + scoring_log = self.messagebus.scoring_log() + self.sw_us_p = scoring_log.producer() + + # db + self.db_sl_c = KafkaConsumerPolling( + spiderlog.consumer(partition_id=None, type=b"db") + ) + self.db_us_c = KafkaConsumerPolling(scoring_log.consumer()) + + spider_feed = self.messagebus.spider_feed() + self.db_sf_p = spider_feed.producer() + + # spider + self.sp_sl_p = spiderlog.producer() + self.sp_sf_c = KafkaConsumerPolling(spider_feed.consumer(partition_id=0)) + + def tearDown(self): + self.sw_us_p.close() + self.db_sf_p.close() + self.sp_sl_p.close() + + self.sw_sl_c.close() + self.db_sl_c.close() + self.db_us_c.close() + self.sp_sf_c.close() + + def spider_log_activity(self, messages): + for i in range(messages): + if i % 2 == 0: + self.sp_sl_p.send( + sha1(str(randint(1, 1000))), + b"http://helloworld.com/way/to/the/sun/" + b"0", + ) + else: + self.sp_sl_p.send( + sha1(str(randint(1, 1000))), b"http://way.to.the.sun" + b"0" + ) + self.sp_sl_p.flush() + + def spider_feed_activity(self): + sf_c = 0 + for _m in self.sp_sf_c.get_messages(timeout=0.1, count=512): + sf_c += 1 + return sf_c + + def sw_activity(self): + c = 0 + p = 0 + for m in self.sw_sl_c.get_messages(timeout=0.1, count=512): + if m.startswith(b"http://helloworld.com/"): + p += 1 + self.sw_us_p.send(None, b"message" + b"0" + b"," + to_bytes(str(c))) + c += 1 + assert p > 0 + return c + + def db_activity(self, messages): + sl_c = 0 + us_c 
= 0 + + for _m in self.db_sl_c.get_messages(timeout=0.1, count=512): + sl_c += 1 + for _m in self.db_us_c.get_messages(timeout=0.1, count=512): + us_c += 1 + for i in range(messages): + if i % 2 == 0: + self.db_sf_p.send(b"newhost", b"http://newhost/new/url/to/crawl") + else: + self.db_sf_p.send( + b"someotherhost", b"http://newhost223/new/url/to/crawl" + ) + self.db_sf_p.flush() + return (sl_c, us_c) + + def test_integration(self): + self.spider_log_activity(64) + assert self.sw_activity() == 64 + assert self.db_activity(128) == (64, 32) + assert self.spider_feed_activity() == 128 diff --git a/tests/test_message_bus_zeromq.py b/tests/test_message_bus_zeromq.py new file mode 100644 index 000000000..26b442d66 --- /dev/null +++ b/tests/test_message_bus_zeromq.py @@ -0,0 +1,123 @@ +import pytest + +pytest.importorskip("zmq") + +from random import randint +from time import sleep +from unittest import SkipTest + +from flaky import flaky +from w3lib.util import to_bytes + +from frontera.contrib.messagebus.zeromq import MessageBus as ZeroMQMessageBus +from frontera.settings import Settings +from frontera.utils.fingerprint import sha1 + + +class MessageBusTester: + def __init__(self, cls, settings=None): + if settings is None: + settings = Settings() + settings.set("SPIDER_FEED_PARTITIONS", 1) + settings.set("SPIDER_LOG_PARTITIONS", 1) + settings.set("QUEUE_HOSTNAME_PARTITIONING", True) + self.messagebus = cls(settings) + spiderlog = self.messagebus.spider_log() + + # sw + self.sw_sl_c = spiderlog.consumer(partition_id=0, type=b"sw") + + scoring_log = self.messagebus.scoring_log() + self.sw_us_p = scoring_log.producer() + + sleep(0.1) + + # db + self.db_sl_c = spiderlog.consumer(partition_id=None, type=b"db") + self.db_us_c = scoring_log.consumer() + + spider_feed = self.messagebus.spider_feed() + self.db_sf_p = spider_feed.producer() + + sleep(0.1) + + # spider + self.sp_sl_p = spiderlog.producer() + self.sp_sf_c = spider_feed.consumer(0) + + sleep(0.1) + + def spider_log_activity(self, messages): + for i in range(messages): + if i % 2 == 0: + self.sp_sl_p.send( + sha1(str(randint(1, 1000))), + b"http://helloworld.com/way/to/the/sun/" + b"0", + ) + else: + self.sp_sl_p.send( + sha1(str(randint(1, 1000))), b"http://way.to.the.sun" + b"0" + ) + self.sp_sl_p.flush() + + def spider_feed_activity(self): + sf_c = 0 + for _m in self.sp_sf_c.get_messages(timeout=1.0, count=512): + sf_c += 1 + return sf_c + + def sw_activity(self): + c = 0 + p = 0 + for m in self.sw_sl_c.get_messages(timeout=1.0, count=512): + if m.startswith(b"http://helloworld.com/"): + p += 1 + self.sw_us_p.send(None, b"message" + b"0" + b"," + to_bytes(str(c))) + c += 1 + if p == 0: + raise SkipTest("No running zeromq service") + assert p > 0 + return c + + def db_activity(self, messages): + sl_c = 0 + us_c = 0 + + for _m in self.db_sl_c.get_messages(timeout=1.0, count=512): + sl_c += 1 + for _m in self.db_us_c.get_messages(timeout=1.0, count=512): + us_c += 1 + for i in range(messages): + if i % 2 == 0: + self.db_sf_p.send(b"newhost", b"http://newhost/new/url/to/crawl") + else: + self.db_sf_p.send( + b"someotherhost", b"http://newhost223/new/url/to/crawl" + ) + self.db_sf_p.flush() + return (sl_c, us_c) + + +class IPv6MessageBusTester(MessageBusTester): + """ + Same as MessageBusTester but with ipv6-localhost + """ + + # TODO This class should be used for IPv6 testing. Use the broker on port + # 5570 for this test. 
+ def __init__(self): + settings = Settings() + settings.set("ZMQ_ADDRESS", "::1") + super().__init__(settings) + + +@flaky +def test_zmq_message_bus(): + """ + Test MessageBus with default settings, IPv6 and Star as ZMQ_ADDRESS + """ + tester = MessageBusTester(ZeroMQMessageBus) + tester.spider_log_activity(64) + assert tester.sw_activity() == 64 + assert tester.db_activity(128) == (64, 32) + assert tester.spider_feed_activity() == 128 diff --git a/tests/test_overused_buffer.py b/tests/test_overused_buffer.py index 96524a4c5..96cdc0469 100644 --- a/tests/test_overused_buffer.py +++ b/tests/test_overused_buffer.py @@ -1,34 +1,59 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from tests.backends import BackendSequenceTest, TEST_SITES -from frontera.utils.tester import DownloaderSimulator, BaseDownloaderSimulator -from six.moves.urllib.parse import urlparse +import pytest +sqlalchemy = pytest.importorskip("sqlalchemy.engine") + +from urllib.parse import urlparse + +from frontera.utils.tester import DownloaderSimulator +from tests.backends import TEST_SITES, BackendSequenceTest -class DFSOverusedBackendTest(BackendSequenceTest): +class DFSOverusedBackendTest(BackendSequenceTest): EXPECTED_SEQUENCES = { "SEQUENCE_01_A": [ - 'https://www.a.com', 'http://b.com', 'http://www.a.com/2', 'http://www.a.com/2/1', 'http://www.a.com/3', - 'http://www.a.com/2/1/3', 'http://www.a.com/2/4/1', 'http://www.a.net', 'http://b.com/2', - 'http://test.cloud.c.com', 'http://cloud.c.com', 'http://test.cloud.c.com/2', - 'http://b.com/entries?page=2', 'http://www.a.com/2/4/2' + "https://www.a.com", + "http://b.com", + "http://www.a.com/2", + "http://www.a.com/2/1", + "http://www.a.com/3", + "http://www.a.com/2/1/3", + "http://www.a.com/2/4/1", + "http://www.a.net", + "http://b.com/2", + "http://test.cloud.c.com", + "http://cloud.c.com", + "http://test.cloud.c.com/2", + "http://b.com/entries?page=2", + "http://www.a.com/2/4/2", ], "SEQUENCE_02_A": [ - 'https://www.a.com', 'http://b.com', 'http://www.a.com/2', 'http://www.a.com/2/1', 'http://www.a.com/3', - 'http://www.a.com/2/1/3', 'http://www.a.com/2/4/1', 'http://www.a.com/2/4/2', 'http://www.a.net', - 'http://b.com/2', 'http://test.cloud.c.com', 'http://cloud.c.com', 'http://test.cloud.c.com/2', - 'http://b.com/entries?page=2' - ] + "https://www.a.com", + "http://b.com", + "http://www.a.com/2", + "http://www.a.com/2/1", + "http://www.a.com/3", + "http://www.a.com/2/1/3", + "http://www.a.com/2/4/1", + "http://www.a.com/2/4/2", + "http://www.a.net", + "http://b.com/2", + "http://test.cloud.c.com", + "http://cloud.c.com", + "http://test.cloud.c.com/2", + "http://b.com/entries?page=2", + ], } def test_sequence1(self): - sequence = self.get_sequence(TEST_SITES['SITE_09'], max_next_requests=5, - downloader_simulator=DownloaderSimulator(rate=1)) + sequence = self.get_sequence( + TEST_SITES["SITE_09"], + max_next_requests=5, + downloader_simulator=DownloaderSimulator(rate=1), + ) assert len(sequence) == 7 all_domains = set() - for requests, iteration, dl_info in sequence: - overused_keys = set(dl_info['overused_keys']) + for requests, _iteration, dl_info in sequence: + overused_keys = set(dl_info["overused_keys"]) for r in requests: url = urlparse(r.url) all_domains.add(url.hostname) diff --git a/tests/test_partitioners.py b/tests/test_partitioners.py index 61f52ada8..6fe20e699 100644 --- a/tests/test_partitioners.py +++ b/tests/test_partitioners.py @@ -1,13 +1,13 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from 
frontera.contrib.backends.partitioners import FingerprintPartitioner, Crc32NamePartitioner -from six.moves import range +from frontera.contrib.backends.partitioners import ( + Crc32NamePartitioner, + FingerprintPartitioner, +) def test_fingerprint_partitioner(): - partitions = list(range(0, 5)) + partitions = list(range(5)) fp = FingerprintPartitioner(partitions) - key = '1be68ff556fd0bbe5802d1a100850da29f7f15b1' + key = "1be68ff556fd0bbe5802d1a100850da29f7f15b1" partition = fp.partition(key, partitions) assert partition == 4 @@ -16,9 +16,9 @@ def test_fingerprint_partitioner(): def test_crc32name_partitioner(): - partitions = list(range(0, 5)) + partitions = list(range(5)) cp = Crc32NamePartitioner(partitions) - key = '1be68ff556fd0bbe5802d1a100850da29f7f15b11' + key = "1be68ff556fd0bbe5802d1a100850da29f7f15b11" partition = cp.partition(key, partitions) assert partition == 3 @@ -27,4 +27,3 @@ def test_crc32name_partitioner(): partition = cp.partition(key, None) assert partition == 3 - diff --git a/tests/test_revisiting_backend.py b/tests/test_revisiting_backend.py index 006fd874a..78609aafb 100644 --- a/tests/test_revisiting_backend.py +++ b/tests/test_revisiting_backend.py @@ -1,12 +1,13 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from tests.backends import BackendSequenceTest, TEST_SITES -from frontera.utils.tester import FrontierTester +import pytest + +sqlalchemy = pytest.importorskip("sqlalchemy.engine") from datetime import timedelta -import pytest from time import sleep +from frontera.utils.tester import FrontierTester +from tests.backends import TEST_SITES, BackendSequenceTest + class RevisitingFrontierTester(FrontierTester): def run(self, add_all_pages=False): @@ -28,24 +29,24 @@ def run(self, add_all_pages=False): class RevisitingBackendTest(BackendSequenceTest): - def get_settings(self): - settings = super(RevisitingBackendTest, self).get_settings() + settings = super().get_settings() settings.set("SQLALCHEMYBACKEND_REVISIT_INTERVAL", timedelta(seconds=2)) - settings.SQLALCHEMYBACKEND_ENGINE = 'sqlite:///:memory:' + settings.SQLALCHEMYBACKEND_ENGINE = "sqlite:///:memory:" return settings @pytest.mark.parametrize( - ('site_list', 'max_next_requests'), [ - ('SITE_01', 5), - ('SITE_02', 10), - ] + ("site_list", "max_next_requests"), + [ + ("SITE_01", 5), + ("SITE_02", 10), + ], ) def test_sequence(self, site_list, max_next_requests): sequence = self.get_url_sequence( site_list=TEST_SITES[site_list], max_next_requests=max_next_requests, - frontier_tester=RevisitingFrontierTester + frontier_tester=RevisitingFrontierTester, ) seen = set() for url in sequence: @@ -53,5 +54,4 @@ def test_sequence(self, site_list, max_next_requests): return seen.add(url) - assert False, "None of the URLs were revisted" - + raise AssertionError("None of the URLs were revisted") diff --git a/tests/test_scrapy.py b/tests/test_scrapy.py index e29608001..78584cf24 100644 --- a/tests/test_scrapy.py +++ b/tests/test_scrapy.py @@ -1,14 +1,16 @@ -# -*- coding: utf-8 -*- +import pytest + +pytest.importorskip("scrapy") -from __future__ import absolute_import -from frontera.contrib.scrapy.converters import RequestConverter, ResponseConverter from scrapy.http.request import Request as ScrapyRequest from scrapy.http.response import Response as ScrapyResponse -from frontera.core.models import Request as FrontierRequest from w3lib.util import to_bytes +from frontera.contrib.scrapy.converters import RequestConverter, ResponseConverter +from frontera.core.models import Request as FrontierRequest 
+ -class TestSpider(object): +class TestSpider: def callback(self): pass @@ -26,51 +28,56 @@ def test_request_response_converters(): rsc = ResponseConverter(spider, rc) url = "http://test.com/test?param=123" - request = ScrapyRequest(url=url, callback=spider.callback, errback=spider.errback, - body=REQUEST_BODY) - request.meta[b'test_param'] = b'test_value' + request = ScrapyRequest( + url=url, callback=spider.callback, errback=spider.errback, body=REQUEST_BODY + ) + request.meta[b"test_param"] = b"test_value" request.headers.appendlist(b"TestKey", b"test value") - request.cookies[b'MyCookie'] = b'CookieContent' + request.cookies[b"MyCookie"] = b"CookieContent" frontier_request = rc.to_frontier(request) - assert frontier_request.meta[b'scrapy_callback'] == b'callback' - assert frontier_request.meta[b'scrapy_errback'] == b'errback' + assert frontier_request.meta[b"scrapy_callback"] == b"callback" + assert frontier_request.meta[b"scrapy_errback"] == b"errback" assert frontier_request.body == to_bytes(REQUEST_BODY) assert frontier_request.url == url - assert frontier_request.method == b'GET' - assert frontier_request.headers[b'Testkey'] == b'test value' - assert frontier_request.cookies[b'MyCookie'] == b'CookieContent' - assert b'frontier_request' not in frontier_request.meta[b'scrapy_meta'] + assert frontier_request.method == b"GET" + assert frontier_request.headers[b"Testkey"] == b"test value" + assert frontier_request.cookies[b"MyCookie"] == b"CookieContent" + assert b"frontier_request" not in frontier_request.meta[b"scrapy_meta"] request_converted = rc.from_frontier(frontier_request) - assert request_converted.meta[b'test_param'] == b'test_value' + assert request_converted.meta[b"test_param"] == b"test_value" assert request_converted.body == to_bytes(REQUEST_BODY) assert request_converted.url == url - assert request_converted.method == 'GET' - assert request_converted.cookies[b'MyCookie'] == b'CookieContent' - assert request_converted.headers.get(b'Testkey') == b'test value' + assert request_converted.method == "GET" + assert request_converted.cookies[b"MyCookie"] == b"CookieContent" + assert request_converted.headers.get(b"Testkey") == b"test value" assert request_converted.callback == spider.callback assert request_converted.errback == spider.errback # Some middleware could change .meta contents - request_converted.meta[b'middleware_stuff'] = b'appeared' + request_converted.meta[b"middleware_stuff"] = b"appeared" - response = ScrapyResponse(url=url, request=request_converted, body=RESPONSE_BODY, - headers={b'TestHeader': b'Test value'}) + response = ScrapyResponse( + url=url, + request=request_converted, + body=RESPONSE_BODY, + headers={b"TestHeader": b"Test value"}, + ) frontier_response = rsc.to_frontier(response) assert frontier_response.body == RESPONSE_BODY - assert frontier_response.meta[b'scrapy_meta'][b'test_param'] == b'test_value' - assert frontier_response.meta[b'scrapy_meta'][b'middleware_stuff'] == b'appeared' + assert frontier_response.meta[b"scrapy_meta"][b"test_param"] == b"test_value" + assert frontier_response.meta[b"scrapy_meta"][b"middleware_stuff"] == b"appeared" assert frontier_response.status_code == 200 - assert b'frontier_request' not in frontier_response.meta[b'scrapy_meta'] + assert b"frontier_request" not in frontier_response.meta[b"scrapy_meta"] response_converted = rsc.from_frontier(frontier_response) assert response_converted.body == RESPONSE_BODY - assert response_converted.meta[b'test_param'] == b'test_value' + assert 
response_converted.meta[b"test_param"] == b"test_value" assert response_converted.url == url assert response_converted.status == 200 - assert response_converted.headers[b'TestHeader'] == b'Test value' + assert response_converted.headers[b"TestHeader"] == b"Test value" frontier_request = FrontierRequest(url) request_converted = rc.from_frontier(frontier_request) diff --git a/tests/test_scrapy_settings_adapter.py b/tests/test_scrapy_settings_adapter.py index 87a352357..03e823e19 100644 --- a/tests/test_scrapy_settings_adapter.py +++ b/tests/test_scrapy_settings_adapter.py @@ -1,16 +1,19 @@ -# -*- coding: utf-8 -*- +import pytest + +pytest.importorskip("scrapy") -from __future__ import absolute_import from frontera.contrib.scrapy.settings_adapter import ScrapySettingsAdapter def test_fallsback_to_crawler_settings(): - settings = ScrapySettingsAdapter({'DELAY_ON_EMPTY': 10}) - assert settings.get('DELAY_ON_EMPTY') == 10 + settings = ScrapySettingsAdapter({"DELAY_ON_EMPTY": 10}) + assert settings.get("DELAY_ON_EMPTY") == 10 def test_frontera_settings_have_precedence_over_crawler_settings(): - crawler_settings = {'MAX_REQUESTS': 10, - 'FRONTERA_SETTINGS': 'tests.scrapy_spider.frontera.settings'} + crawler_settings = { + "MAX_REQUESTS": 10, + "FRONTERA_SETTINGS": "tests.scrapy_spider.frontera.settings", + } settings = ScrapySettingsAdapter(crawler_settings) - assert settings.get('MAX_REQUESTS') == 5 + assert settings.get("MAX_REQUESTS") == 5 diff --git a/tests/test_scrapy_spider.py b/tests/test_scrapy_spider.py index 0b797ddcd..1dca4ccdf 100644 --- a/tests/test_scrapy_spider.py +++ b/tests/test_scrapy_spider.py @@ -1,19 +1,33 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from twisted.internet import reactor -from scrapy.crawler import Crawler +import pytest + +pytest.importorskip("scrapy") + from scrapy import signals +from scrapy.crawler import Crawler from scrapy.settings import Settings + from tests.scrapy_spider.spiders.example import MySpider def test_scrapy_spider(): + from scrapy.settings.default_settings import TWISTED_REACTOR + from scrapy.utils.misc import load_object + from twisted.internet.asyncioreactor import AsyncioSelectorReactor + + default_reactor = load_object(TWISTED_REACTOR) + if default_reactor == AsyncioSelectorReactor: + from scrapy.utils.reactor import install_reactor + + install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") + + from twisted.internet import reactor + settings = Settings() settings.setmodule("tests.scrapy_spider.settings") crawler = Crawler(MySpider, settings=settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.crawl() reactor.run() - stats = crawler.stats.spider_stats['example'] - assert stats['frontera/crawled_pages_count'] == 5 + stats = crawler.stats.spider_stats["example"] + assert stats["frontera/crawled_pages_count"] == 1 assert crawler.spider.callback_calls > 0 diff --git a/tests/test_seed_loader.py b/tests/test_seed_loader.py index bc512e2a9..5313c871f 100644 --- a/tests/test_seed_loader.py +++ b/tests/test_seed_loader.py @@ -1,56 +1,62 @@ -import os +import pytest + +pytest.importorskip("boto") +pytest.importorskip("scrapy") + import unittest +from pathlib import Path from shutil import rmtree from tempfile import mkdtemp +from unittest import mock from scrapy.spiders import Spider -from frontera.settings import Settings from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader, NotConfigured from frontera.contrib.scrapy.middlewares.seeds.s3 import 
S3SeedLoader - +from frontera.settings import Settings from tests.mocks.boto import MockConnection -from tests import mock class TestFileSeedLoader(unittest.TestCase): - def setUp(self): - self.tmp_path = mkdtemp() + self.tmp_path = Path(mkdtemp()) def tearDown(self): rmtree(self.tmp_path) def seed_loader_setup(self, seeds_content=None): - seed_path = os.path.join(self.tmp_path, 'seeds.txt') + seed_path = self.tmp_path / "seeds.txt" default_content = """ https://www.example.com https://www.scrapy.org """ seeds_content = seeds_content or default_content - with open(seed_path, 'wb') as tmpl_file: - tmpl_file.write(seeds_content.encode('utf-8')) - assert os.path.isfile(seed_path) # Failure of test itself + with seed_path.open("wb") as tmpl_file: + tmpl_file.write(seeds_content.encode("utf-8")) + assert seed_path.is_file() # Failure of test itself settings = Settings() - settings.SEEDS_SOURCE = seed_path - crawler = type('crawler', (object,), {}) + settings.SEEDS_SOURCE = str(seed_path) + crawler = type("crawler", (object,), {}) crawler.settings = settings return FileSeedLoader(crawler) def test_seeds_not_configured(self): - crawler = type('crawler', (object,), {}) + crawler = type("crawler", (object,), {}) crawler.settings = Settings() self.assertRaises(NotConfigured, FileSeedLoader, crawler) def test_load_seeds(self): seed_loader = self.seed_loader_setup() seeds = seed_loader.load_seeds() - self.assertEqual(seeds, ['https://www.example.com', 'https://www.scrapy.org']) + self.assertEqual(seeds, ["https://www.example.com", "https://www.scrapy.org"]) def test_process_start_requests(self): seed_loader = self.seed_loader_setup() - requests = seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org']) + requests = seed_loader.process_start_requests(None, Spider(name="spider")) + self.assertEqual( + [r.url for r in requests], + ["https://www.example.com", "https://www.scrapy.org"], + ) def test_process_start_requests_ignore_comments(self): seeds_content = """ @@ -60,22 +66,24 @@ def test_process_start_requests_ignore_comments(self): # https://www.test.com """ seed_loader = self.seed_loader_setup(seeds_content) - requests = seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org']) + requests = seed_loader.process_start_requests(None, Spider(name="spider")) + self.assertEqual( + [r.url for r in requests], + ["https://www.example.com", "https://www.scrapy.org"], + ) class TestS3SeedLoader(unittest.TestCase): - def setUp(self): - self.tmp_path = mkdtemp() + self.tmp_path = Path(mkdtemp()) settings = Settings() - settings.SEEDS_SOURCE = 's3://some-bucket/seeds-folder' - settings.SEEDS_AWS_ACCESS_KEY = 'access_key' - settings.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key' - crawler = type('crawler', (object,), {}) + settings.SEEDS_SOURCE = "s3://some-bucket/seeds-folder" + settings.SEEDS_AWS_ACCESS_KEY = "access_key" + settings.SEEDS_AWS_SECRET_ACCESS_KEY = "secret_key" + crawler = type("crawler", (object,), {}) crawler.settings = settings - self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt') - self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt') + self.seed_path_1 = self.tmp_path / "seeds1.txt" + self.seed_path_2 = self.tmp_path / "seeds2.txt" s1_content = """ https://www.example.com https://www.scrapy.org @@ -85,43 +93,50 @@ def setUp(self): https://www.test.com """ - with 
open(self.seed_path_1, 'wb') as tmpl_file: - tmpl_file.write(s1_content.encode('utf-8')) - with open(self.seed_path_2, 'wb') as tmpl_file: - tmpl_file.write(s2_content.encode('utf-8')) + with self.seed_path_1.open("wb") as tmpl_file: + tmpl_file.write(s1_content.encode("utf-8")) + with self.seed_path_2.open("wb") as tmpl_file: + tmpl_file.write(s2_content.encode("utf-8")) self.seed_loader = S3SeedLoader(crawler) def tearDown(self): rmtree(self.tmp_path) def test_invalid_s3_seed_source(self): - crawler = type('crawler', (object,), {}) + crawler = type("crawler", (object,), {}) settings = Settings() - settings.SEEDS_SOURCE = 'invalid_url' + settings.SEEDS_SOURCE = "invalid_url" crawler.settings = settings self.assertRaises(NotConfigured, S3SeedLoader, crawler) def test_process_start_requests(self): - urls = ['https://www.example.com', 'https://www.scrapy.org', - 'https://www.dmoz.org', 'https://www.test.com'] + urls = [ + "https://www.example.com", + "https://www.scrapy.org", + "https://www.dmoz.org", + "https://www.test.com", + ] self.check_request_urls(urls) def test_s3_loader_ignores_non_txt_files(self): urls = [] - self.check_request_urls(urls, '.ini') - - def check_request_urls(self, urls, key_extension='.txt'): - with open(self.seed_path_1, 'rU') as s1: - with open(self.seed_path_2, 'rU') as s2: - conn = MockConnection() - bucket = conn.create_bucket('some-bucket') - bucket.add_key('seeds-folder/seeds1%s' % key_extension, s1) - bucket.add_key('seeds-folder/seeds2%s' % key_extension, s2) - - def mocked_connect_s3(*args, **kwargs): - return conn - - with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3', - side_effect=mocked_connect_s3): - requests = self.seed_loader.process_start_requests(None, Spider(name='spider')) - self.assertEqual(set([r.url for r in requests]), set(urls)) + self.check_request_urls(urls, ".ini") + + def check_request_urls(self, urls, key_extension=".txt"): + with Path(self.seed_path_1).open() as s1, Path(self.seed_path_2).open() as s2: + conn = MockConnection() + bucket = conn.create_bucket("some-bucket") + bucket.add_key(f"seeds-folder/seeds1{key_extension}", s1) + bucket.add_key(f"seeds-folder/seeds2{key_extension}", s2) + + def mocked_connect_s3(*args, **kwargs): + return conn + + with mock.patch( + "frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3", + side_effect=mocked_connect_s3, + ): + requests = self.seed_loader.process_start_requests( + None, Spider(name="spider") + ) + self.assertEqual({r.url for r in requests}, set(urls)) diff --git a/tests/test_settings.py b/tests/test_settings.py index a50d6970d..1bba57efb 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -1,22 +1,19 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import -from frontera.settings import Settings, BaseSettings +from frontera.settings import BaseSettings, Settings def test_settings_on_a_python_module_are_loaded(): - settings = Settings('tests.scrapy_spider.frontera.settings') - assert settings.get('MAX_REQUESTS') == 5 + settings = Settings("tests.scrapy_spider.frontera.settings") + assert settings.get("MAX_REQUESTS") == 5 def test_settings_passed_as_attributes_can_be_found(): - settings = Settings(attributes={'SETTING': 'value'}) - assert settings.get('SETTING') == 'value' + settings = Settings(attributes={"SETTING": "value"}) + assert settings.get("SETTING") == "value" def test_fallsback_to_frontera_default_settings(): settings = Settings() - assert settings.get('MAX_NEXT_REQUESTS') == 64 + assert settings.get("MAX_NEXT_REQUESTS") 
== 64 def test_allows_settings_to_be_accessed_by_attribute(): @@ -31,13 +28,13 @@ def test_settings_attributes_can_be_assigned(): def test_object_from_loads_settings_from_a_module(): - module = 'tests.scrapy_spider.frontera.settings' + module = "tests.scrapy_spider.frontera.settings" settings = BaseSettings.object_from(module) - assert settings.get('MAX_REQUESTS') == 5 + assert settings.get("MAX_REQUESTS") == 5 def test_new_instance_copies_the_given_instance(): settings = Settings() new_instance = BaseSettings.object_from(settings) assert new_instance.MAX_NEXT_REQUESTS == 64 - assert type(new_instance) == Settings + assert type(new_instance) is Settings diff --git a/tests/test_strategy.py b/tests/test_strategy.py index 3c6e5dafc..45d5e269f 100644 --- a/tests/test_strategy.py +++ b/tests/test_strategy.py @@ -1,11 +1,14 @@ -# -*- coding: utf-8 -*- -from frontera.worker.strategies import BaseCrawlingStrategy -from frontera.worker.strategy import StatesContext -from frontera.settings import Settings -from tests.mocks.frontier_manager import FakeFrontierManager +import pytest + +pytest.importorskip("twisted") +sqlalchemy = pytest.importorskip("sqlalchemy.engine") from frontera.contrib.backends.memory import MemoryStates from frontera.core.components import States +from frontera.settings import Settings +from frontera.worker.strategies import BaseCrawlingStrategy +from frontera.worker.strategy import StatesContext +from tests.mocks.frontier_manager import FakeFrontierManager class DummyCrawlingStrategy(BaseCrawlingStrategy): @@ -22,7 +25,7 @@ def links_extracted(self, request, links): pass -class MessageBusStream(object): +class MessageBusStream: def send(self, request, score=1.0, dont_queue=False): pass @@ -30,7 +33,7 @@ def flush(self): pass -class TestCrawlingStrategy(object): +class TestCrawlingStrategy: def strategy(self): settings = Settings() manager = FakeFrontierManager(settings) @@ -42,17 +45,17 @@ def strategy(self): def test_create_request(self): s = self.strategy() req = s.create_request("http://test.com/someurl") - assert req.meta[b'fingerprint'] == b'955ac04f1b1a96de60a5139ad90c80be87822159' + assert req.meta[b"fingerprint"] == b"955ac04f1b1a96de60a5139ad90c80be87822159" def test_states_refresh(self): s = self.strategy() states = s._states_context._states url = "http://test.com/someurl" req1 = s.create_request(url) - req1.meta[b'state'] = States.CRAWLED + req1.meta[b"state"] = States.CRAWLED states.update_cache(req1) req2 = s.create_request(url) s.refresh_states([req2]) - assert req2.meta[b'state'] == req1.meta[b'state'] - assert req2.meta[b'state'] == States.CRAWLED + assert req2.meta[b"state"] == req1.meta[b"state"] + assert req2.meta[b"state"] == States.CRAWLED diff --git a/tests/test_utils_async.py b/tests/test_utils_async.py index bbf6d83fe..384ba53f1 100644 --- a/tests/test_utils_async.py +++ b/tests/test_utils_async.py @@ -1,14 +1,15 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import import pytest -from twisted.test.proto_helpers import MemoryReactor + +pytest.importorskip("twisted") + from twisted.internet.protocol import Factory from twisted.internet.task import Clock -from frontera.utils.async import CallLaterOnce, listen_tcp +from twisted.test.proto_helpers import MemoryReactor +from frontera.utils.async_ import CallLaterOnce, listen_tcp -class TestCallLaterOnce(object): +class TestCallLaterOnce: called = 0 def call_function(self): @@ -51,9 +52,8 @@ def test_call_later_cancel(self): assert self.called == 0 -class TestListenTCP(object): - - host = 
'127.0.0.1' +class TestListenTCP: + host = "127.0.0.1" port = 6023 portrange = [6023, 6073] @@ -67,7 +67,7 @@ def test_listen_tcp_invalid_port_range(self): reactor = MemoryReactor() with pytest.raises(Exception) as info: listen_tcp([1, 2, 3], self.host, Factory, reactor=reactor) - assert str(info.value) == 'invalid portrange: [1, 2, 3]' + assert str(info.value) == "invalid portrange: [1, 2, 3]" def test_listen_tcp_default(self): reactor = MemoryReactor() diff --git a/tests/test_utils_heap.py b/tests/test_utils_heap.py index 458e8e9c2..781cdf270 100644 --- a/tests/test_utils_heap.py +++ b/tests/test_utils_heap.py @@ -1,4 +1,3 @@ -from __future__ import absolute_import from frontera.utils.heap import Heap @@ -6,8 +5,7 @@ def cmp(a, b): return (a > b) - (a < b) -class TestHeap(object): - +class TestHeap: def test_heap_order(self): heap = Heap(cmp) heap.push(5) @@ -21,7 +19,7 @@ def test_heap_order(self): assert heap.pop(1) == [] def test_heap_obj(self): - obj = type('obj', (object,), {}) + obj = type("obj", (object,), {}) a = obj() a.score = 3 b = obj() diff --git a/tests/test_utils_misc.py b/tests/test_utils_misc.py index af6f6d992..95749f3f7 100644 --- a/tests/test_utils_misc.py +++ b/tests/test_utils_misc.py @@ -1,39 +1,36 @@ -from __future__ import absolute_import import hashlib + import pytest -from frontera.utils.misc import load_object, get_crc32, chunks, to_signed32 -import six +from frontera.utils.misc import chunks, get_crc32, load_object, to_signed32 -class TestGetCRC32(object): +class TestGetCRC32: def test_bytes(self): - assert get_crc32(b'example') == 1861000095 + assert get_crc32(b"example") == 1861000095 def test_ascii_unicode(self): - assert get_crc32(u'example') == 1861000095 + assert get_crc32("example") == 1861000095 def test_non_ascii_unicode(self): - assert get_crc32(u'example\u5000') == 1259721235 + assert get_crc32("example\u5000") == 1259721235 def test_non_ascii_bytes(self): - assert get_crc32(u'example\u5000'.encode('utf8')) == 1259721235 + assert get_crc32("example\u5000".encode()) == 1259721235 def test_negative_crc32(self): - assert get_crc32(b'1') == -2082672713 + assert get_crc32(b"1") == -2082672713 def test_crc32_range(self): - left, right = -2**31, 2**31 - 1 + left, right = -(2**31), 2**31 - 1 for x in range(10000): - bytestr = hashlib.md5(str(x).encode('ascii')).hexdigest() + bytestr = hashlib.md5(str(x).encode("ascii")).hexdigest() assert left <= get_crc32(bytestr) <= right - for x in [left, left + 1, right - 1, right, right + 1, - 2**32 - 2, 2**32 - 1]: + for x in [left, left + 1, right - 1, right, right + 1, 2**32 - 2, 2**32 - 1]: assert left <= to_signed32(x) <= right -class TestChunks(object): - +class TestChunks: def test_empty_list(self): assert list(chunks([], 1)) == [] @@ -41,44 +38,47 @@ def test_multiple_length(self): assert list(chunks([1, 2, 3, 4, 5, 6], 2)) == [[1, 2], [3, 4], [5, 6]] def test_non_multiple_length(self): - assert list(chunks([1, 2, 3, 4, 5, 6, 7, 8], 3)) == [[1, 2, 3], [4, 5, 6], [7, 8]] - + assert list(chunks([1, 2, 3, 4, 5, 6, 7, 8], 3)) == [ + [1, 2, 3], + [4, 5, 6], + [7, 8], + ] -class TestLoadObject(object): +class TestLoadObject: def test_load_class(self): - obj = load_object('tests.mocks.load_objects.MockClass') + obj = load_object("tests.mocks.load_objects.MockClass") assert obj.val == 10 def test_load_instance(self): - obj = load_object('tests.mocks.load_objects.mock_instance') + obj = load_object("tests.mocks.load_objects.mock_instance") assert obj.val == 5 def test_load_variable(self): - obj = 
load_object('tests.mocks.load_objects.mock_variable') - assert obj == 'test' + obj = load_object("tests.mocks.load_objects.mock_variable") + assert obj == "test" def test_load_function(self): - obj = load_object('tests.mocks.load_objects.mock_function') + obj = load_object("tests.mocks.load_objects.mock_function") assert obj() == 2 def test_value_error(self): with pytest.raises(ValueError) as info: - load_object('frontera') + load_object("frontera") assert str(info.value) == "Error loading object 'frontera': not a full path" def test_import_error(self): with pytest.raises(ImportError) as info: - load_object('frontera.non_existent_module.object') - if six.PY2: - assert str(info.value) == ("Error loading object 'frontera.non_existent_module.object'" - ": No module named non_existent_module") - else: - assert str(info.value) == ("Error loading object 'frontera.non_existent_module.object'" - ": No module named 'frontera.non_existent_module'") + load_object("frontera.non_existent_module.object") + assert str(info.value) == ( + "Error loading object 'frontera.non_existent_module.object'" + ": No module named 'frontera.non_existent_module'" + ) def test_name_error(self): with pytest.raises(NameError) as info: - load_object('tests.mocks.load_objects.non_existent_object') - assert str(info.value) == ("Module 'tests.mocks.load_objects' doesn't define" - " any object named 'non_existent_object'") + load_object("tests.mocks.load_objects.non_existent_object") + assert str(info.value) == ( + "Module 'tests.mocks.load_objects' doesn't define" + " any object named 'non_existent_object'" + ) diff --git a/tests/test_utils_url.py b/tests/test_utils_url.py index 33582fff3..da4ca6387 100644 --- a/tests/test_utils_url.py +++ b/tests/test_utils_url.py @@ -1,23 +1,39 @@ -from __future__ import absolute_import +import pytest + +pytest.importorskip("tldextract") + import unittest -from frontera.utils.url import parse_url, parse_domain_from_url, \ - parse_domain_from_url_fast +from frontera.utils.url import ( + parse_domain_from_url, + parse_domain_from_url_fast, + parse_url, +) -simple_url = 'http://www.example.com' -complete_url = 'http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag' +simple_url = "http://www.example.com" +complete_url = ( + "http://username:password@www.example.com:80/some/page/do?a=1&b=2&c=3#frag" +) class TestParseUrl(unittest.TestCase): - def test_simple_url(self): - self.assertEqual(parse_url(simple_url), - ('http', 'www.example.com', '', '', '', '')) + self.assertEqual( + parse_url(simple_url), ("http", "www.example.com", "", "", "", "") + ) def test_complete_url(self): - self.assertEqual(parse_url(complete_url), - ('http', 'username:password@www.example.com:80', - '/some/page/do', '', 'a=1&b=2&c=3', 'frag')) + self.assertEqual( + parse_url(complete_url), + ( + "http", + "username:password@www.example.com:80", + "/some/page/do", + "", + "a=1&b=2&c=3", + "frag", + ), + ) def test_already_parsed(self): result = parse_url(simple_url) @@ -25,34 +41,53 @@ def test_already_parsed(self): class TestParseDomainFromUrl(unittest.TestCase): - def test_simple_url(self): - self.assertEqual(parse_domain_from_url(simple_url), - ('www.example.com', 'example.com', 'http', 'example', 'com', 'www')) + self.assertEqual( + parse_domain_from_url(simple_url), + ("www.example.com", "example.com", "http", "example", "com", "www"), + ) def test_complete_url(self): - self.assertEqual(parse_domain_from_url(complete_url), - ('www.example.com', 'example.com', 'http', 'example', 'com', 'www')) + 
self.assertEqual( + parse_domain_from_url(complete_url), + ("www.example.com", "example.com", "http", "example", "com", "www"), + ) def test_missing_tld(self): - self.assertEqual(parse_domain_from_url('http://www.example'), - ('www.example', 'example', 'http', 'example', '', 'www')) + self.assertEqual( + parse_domain_from_url("http://www.example"), + ("www.example", "example", "http", "example", "", "www"), + ) def test_missing_subdomain(self): - self.assertEqual(parse_domain_from_url('https://example.com'), - ('example.com', 'example.com', 'https', 'example', 'com', '')) + self.assertEqual( + parse_domain_from_url("https://example.com"), + ("example.com", "example.com", "https", "example", "com", ""), + ) def test_missing_scheme(self): - self.assertEqual(parse_domain_from_url('www.example.com'), - ('www.example.com', 'example.com', '', 'example', 'com', 'www')) + self.assertEqual( + parse_domain_from_url("www.example.com"), + ("www.example.com", "example.com", "", "example", "com", "www"), + ) class TestParseDomainFromUrlFast(unittest.TestCase): - def test_simple_url(self): - self.assertEqual(parse_domain_from_url_fast(simple_url), - ('www.example.com', 'www.example.com', 'http', '', '', '')) + self.assertEqual( + parse_domain_from_url_fast(simple_url), + ("www.example.com", "www.example.com", "http", "", "", ""), + ) def test_complete_url(self): - self.assertEqual(parse_domain_from_url_fast(complete_url), - ('username:password@www.example.com:80', 'www.example.com', 'http', '', '', '')) + self.assertEqual( + parse_domain_from_url_fast(complete_url), + ( + "username:password@www.example.com:80", + "www.example.com", + "http", + "", + "", + "", + ), + ) diff --git a/tests/test_worker_db.py b/tests/test_worker_db.py index 05b91d0c2..94256db4b 100644 --- a/tests/test_worker_db.py +++ b/tests/test_worker_db.py @@ -1,24 +1,36 @@ -from frontera.core.models import Request, Response -from frontera.worker.db import DBWorker -from frontera.settings import Settings -from frontera.core.components import States +import pytest +pytest.importorskip("msgpack") +pytest.importorskip("twisted") -r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'state': States.DEFAULT, b'jid': 0}) -r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'state': States.DEFAULT, b'jid': 0}) -r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'state': States.DEFAULT, b'jid': 0}) +from frontera.core.components import States +from frontera.core.models import Request, Response +from frontera.settings import Settings +from frontera.worker.db import DBWorker +r1 = Request( + "http://www.example.com/", + meta={b"fingerprint": b"1", b"state": States.DEFAULT, b"jid": 0}, +) +r2 = Request( + "http://www.scrapy.org/", + meta={b"fingerprint": b"2", b"state": States.DEFAULT, b"jid": 0}, +) +r3 = Request( + "https://www.dmoz.org", + meta={b"fingerprint": b"3", b"state": States.DEFAULT, b"jid": 0}, +) -class TestDBWorker(object): +class TestDBWorker: def dbw_setup(self, distributed=False): settings = Settings() settings.MAX_NEXT_REQUESTS = 64 - settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.MESSAGE_BUS = "tests.mocks.message_bus.FakeMessageBus" if distributed: - settings.BACKEND = 'tests.mocks.components.FakeDistributedBackend' + settings.BACKEND = "tests.mocks.components.FakeDistributedBackend" else: - settings.BACKEND = 'tests.mocks.components.FakeBackend' + settings.BACKEND = "tests.mocks.components.FakeBackend" return DBWorker(settings, True, True, False) def 
test_add_seeds(self): @@ -26,7 +38,7 @@ def test_add_seeds(self): msg = dbw._encoder.encode_add_seeds([r1, r2, r3]) dbw.spider_log_consumer.put_messages([msg]) dbw.consume_incoming() - assert set([r.url for r in dbw._backend.seeds]) == set([r.url for r in [r1, r2, r3]]) + assert {r.url for r in dbw._backend.seeds} == {r.url for r in [r1, r2, r3]} def test_page_crawled(self): dbw = self.dbw_setup() @@ -34,22 +46,22 @@ def test_page_crawled(self): msg = dbw._encoder.encode_page_crawled(resp) dbw.spider_log_consumer.put_messages([msg]) dbw.consume_incoming() - assert set([r.url for r in dbw._backend.responses]) == set([r1.url]) + assert {r.url for r in dbw._backend.responses} == {r1.url} def test_links_extracted(self): dbw = self.dbw_setup() msg = dbw._encoder.encode_links_extracted(r1, [r2, r3]) dbw.spider_log_consumer.put_messages([msg]) dbw.consume_incoming() - assert set([r.url for r in dbw._backend.links]) == set([r2.url, r3.url]) + assert {r.url for r in dbw._backend.links} == {r2.url, r3.url} def test_request_error(self): dbw = self.dbw_setup() - msg = dbw._encoder.encode_request_error(r1, 'error') + msg = dbw._encoder.encode_request_error(r1, "error") dbw.spider_log_consumer.put_messages([msg]) dbw.consume_incoming() assert dbw._backend.errors[0][0].url == r1.url - assert dbw._backend.errors[0][1] == 'error' + assert dbw._backend.errors[0][1] == "error" def test_scoring(self): dbw = self.dbw_setup(True) @@ -61,15 +73,16 @@ def test_scoring(self): msg2 = dbw._encoder.encode_update_score(r3, 0.6, True) dbw.scoring_log_consumer.put_messages([msg1, msg2]) dbw.consume_scoring() - assert set([r.url for r in dbw._backend.queue.requests]) == set([r1.url, r3.url]) + assert {r.url for r in dbw._backend.queue.requests} == {r1.url, r3.url} assert dbw.new_batch() == 2 def test_new_batch(self): dbw = self.dbw_setup(True) dbw._backend.queue.put_requests([r1, r2, r3]) assert dbw.new_batch() == 3 - assert set(dbw.spider_feed_producer.messages) == \ - set([dbw._encoder.encode_request(r) for r in [r1, r2, r3]]) + assert set(dbw.spider_feed_producer.messages) == { + dbw._encoder.encode_request(r) for r in [r1, r2, r3] + } def test_offset(self): dbw = self.dbw_setup(True) diff --git a/tests/test_worker_strategy.py b/tests/test_worker_strategy.py index 7a2acd873..cc4ccf094 100644 --- a/tests/test_worker_strategy.py +++ b/tests/test_worker_strategy.py @@ -1,22 +1,25 @@ -from frontera.worker.strategy import StrategyWorker -from frontera.worker.strategies.bfs import CrawlingStrategy -from frontera.settings import Settings -from frontera.core.models import Request, Response -from frontera.core.components import States +import pytest +pytest.importorskip("msgpack") +pytest.importorskip("twisted") -r1 = Request('http://www.example.com/', meta={b'fingerprint': b'1', b'jid': 0}) -r2 = Request('http://www.scrapy.org/', meta={b'fingerprint': b'2', b'jid': 0}) -r3 = Request('https://www.dmoz.org', meta={b'fingerprint': b'3', b'jid': 0}) -r4 = Request('http://www.test.com/some/page', meta={b'fingerprint': b'4', b'jid': 0}) +from frontera.core.components import States +from frontera.core.models import Request, Response +from frontera.settings import Settings +from frontera.worker.strategies.bfs import CrawlingStrategy +from frontera.worker.strategy import StrategyWorker +r1 = Request("http://www.example.com/", meta={b"fingerprint": b"1", b"jid": 0}) +r2 = Request("http://www.scrapy.org/", meta={b"fingerprint": b"2", b"jid": 0}) +r3 = Request("https://www.dmoz.org", meta={b"fingerprint": b"3", b"jid": 0}) +r4 = 
Request("http://www.test.com/some/page", meta={b"fingerprint": b"4", b"jid": 0}) -class TestStrategyWorker(object): +class TestStrategyWorker: def sw_setup(self): settings = Settings() - settings.BACKEND = 'frontera.contrib.backends.sqlalchemy.Distributed' - settings.MESSAGE_BUS = 'tests.mocks.message_bus.FakeMessageBus' + settings.BACKEND = "frontera.contrib.backends.sqlalchemy.Distributed" + settings.MESSAGE_BUS = "tests.mocks.message_bus.FakeMessageBus" settings.SPIDER_LOG_CONSUMER_BATCH_SIZE = 100 return StrategyWorker(settings, CrawlingStrategy) @@ -24,20 +27,20 @@ def test_add_seeds(self): sw = self.sw_setup() msg = sw._encoder.encode_add_seeds([r1, r2, r3, r4]) sw.consumer.put_messages([msg]) - r2.meta[b'state'] = States.CRAWLED + r2.meta[b"state"] = States.CRAWLED sw.states.update_cache([r2]) sw.work() - r1.meta[b'state'] = States.QUEUED - r3.meta[b'state'] = States.QUEUED - r4.meta[b'state'] = States.QUEUED - assert set(sw.scoring_log_producer.messages) == \ - set([sw._encoder.encode_update_score(r, 1.0, True) - for r in [r1, r3, r4]]) + r1.meta[b"state"] = States.QUEUED + r3.meta[b"state"] = States.QUEUED + r4.meta[b"state"] = States.QUEUED + assert set(sw.scoring_log_producer.messages) == { + sw._encoder.encode_update_score(r, 1.0, True) for r in [r1, r3, r4] + } def test_page_crawled(self): sw = self.sw_setup() - r1.meta[b'jid'] = 1 + r1.meta[b"jid"] = 1 resp = Response(r1.url, request=r1) msg = sw._encoder.encode_page_crawled(resp) sw.consumer.put_messages([msg]) @@ -49,25 +52,29 @@ def test_page_crawled(self): sw.work() r1c = r1.copy() sw.states.set_states(r1c) - assert r1c.meta[b'state'] == States.CRAWLED + assert r1c.meta[b"state"] == States.CRAWLED def test_links_extracted(self): sw = self.sw_setup() sw.job_id = 0 - r1.meta[b'jid'] = 0 + r1.meta[b"jid"] = 0 msg = sw._encoder.encode_links_extracted(r1, [r3, r4]) sw.consumer.put_messages([msg]) sw.work() - r3.meta[b'state'] = States.QUEUED - r4.meta[b'state'] = States.QUEUED - assert set(sw.scoring_log_producer.messages) == \ - set(sw._encoder.encode_update_score(r, sw.strategy.get_score(r.url), True) for r in [r3, r4]) + r3.meta[b"state"] = States.QUEUED + r4.meta[b"state"] = States.QUEUED + assert set(sw.scoring_log_producer.messages) == { + sw._encoder.encode_update_score(r, sw.strategy.get_score(r.url), True) + for r in [r3, r4] + } def test_request_error(self): sw = self.sw_setup() - msg = sw._encoder.encode_request_error(r4, 'error') + msg = sw._encoder.encode_request_error(r4, "error") sw.consumer.put_messages([msg]) sw.work() - r4.meta[b'state'] = States.ERROR - assert sw.scoring_log_producer.messages.pop() == \ - sw._encoder.encode_update_score(r4, 0.0, False) + r4.meta[b"state"] = States.ERROR + assert ( + sw.scoring_log_producer.messages.pop() + == sw._encoder.encode_update_score(r4, 0.0, False) + ) diff --git a/tests/utils.py b/tests/utils.py index bea9fe690..afb34e71e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,61 +1,57 @@ -import six import logging import logging.config +from io import StringIO colors = { - 'bold_yellow': '\x1b[01;33m', - 'green': '\x1b[32m', - 'red': '\x1b[31m', - 'reset': '\x1b[0m', - 'white': '\x1b[37m', + "bold_yellow": "\x1b[01;33m", + "green": "\x1b[32m", + "red": "\x1b[31m", + "reset": "\x1b[0m", + "white": "\x1b[37m", } DEFAULT_LOGGING = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'message': { - 'format': '%(message)s' + "version": 1, + "disable_existing_loggers": False, + "formatters": {"message": {"format": "%(message)s"}}, + "handlers": { + 
"console": { + "level": "DEBUG", + "class": "logging.StreamHandler", + "formatter": "message", } }, - 'handlers': { - 'console': { - 'level': 'DEBUG', - 'class': 'logging.StreamHandler', - 'formatter': 'message', - } - }, - 'loggers': { - 'frontera': { - 'handlers': ['console'], - 'level': 'DEBUG', + "loggers": { + "frontera": { + "handlers": ["console"], + "level": "DEBUG", }, - } + }, } -class LoggingCaptureMixin(object): +class LoggingCaptureMixin: """ Capture the output from the 'frontera' logger and store it on the class's logger_output attribute. """ def setUp(self): - self.logger = logging.getLogger('frontera') + self.logger = logging.getLogger("frontera") self.old_stream = self.logger.handlers[0].stream - self.logger_output = six.StringIO() + self.logger_output = StringIO() self.logger.handlers[0].stream = self.logger_output def tearDown(self): self.logger.handlers[0].stream = self.old_stream -class SetupDefaultLoggingMixin(object): +class SetupDefaultLoggingMixin: @classmethod def setUpClass(cls): - super(SetupDefaultLoggingMixin, cls).setUpClass() + super().setUpClass() logging.config.dictConfig(DEFAULT_LOGGING) @classmethod def tearDownClass(cls): - super(SetupDefaultLoggingMixin, cls).tearDownClass() + super().tearDownClass() diff --git a/tox.ini b/tox.ini index 9e47f6eac..6a0e9bb96 100644 --- a/tox.ini +++ b/tox.ini @@ -4,31 +4,122 @@ # and then run "tox" from this directory. [tox] +envlist = pre-commit,min,min-hbase,min-kafka,min-logging,min-s3,min-scrapy,min-sql,min-zeromq,min-all,py39,py310,py311,py312,py313,hbase,kafka,logging,s3,scrapy,sql,zeromq,all minversion = 1.8 -envlist = py27,flake8 -skip_missing_interpreters = True [testenv] -# do not load /etc/boto.cfg with Python 3 incompatible plugin -# https://github.com/travis-ci/travis-ci/issues/5246#issuecomment-166460882 -setenv = - BOTO_CONFIG = /tmp/nowhere deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/requirements/tests.txt + flaky + pytest + pytest-cov commands = - py.test --cov-report=term --cov=frontera -s -v {posargs:tests} + pytest --cov-report=term --cov=frontera -s -v {posargs:tests} -[testenv:flake8] -changedir = {toxinidir} -deps = flake8 -commands = flake8 setup.py frontera +[testenv:hbase] +extras = hbase -# Options for flake8 -[flake8] -ignore = E265,E501,F401,W391,W292,E226 -exclude = frontera/_version.py,versioneer.py,docs/source/conf.py,frontera/contrib/backends/opic/discovery.py +[testenv:kafka] +extras = kafka +# https://github.com/dpkp/kafka-python/issues/2412 +basepython = python3.11 +[testenv:logging] +extras = logging + +[testenv:s3] +extras = s3 + +[testenv:scrapy] +extras = scrapy + +[testenv:sql] +extras = sql + +[testenv:zeromq] +extras = zeromq + +[testenv:all] +extras = + hbase + kafka + logging + s3 + scrapy + sql + zeromq + +[testenv:min] +basepython = python3.9 +deps = + {[testenv]deps} + cityhash==0.4.7 + six==1.8.0 + w3lib==1.17.0 + +[testenv:min-hbase] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + cachetools==0.4.0 + happybase==1.2.0 + msgpack-python==0.4 + setuptools==50.3.1 + +[testenv:min-kafka] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + kafka-python==1.4.3 + twisted==20.3.0 + +[testenv:min-logging] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + colorlog==2.4.0 + python-json-logger==0.1.5 + +[testenv:min-s3] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + boto==2.49.0 + +[testenv:min-scrapy] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + 
scrapy==2.7.0 + +[testenv:min-sql] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + cachetools==0.4.0 + SQLAlchemy==1.0.0 + +[testenv:min-zeromq] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + pyzmq==19.0.2 + msgpack-python==0.4 + +[testenv:min-all] +basepython = {[testenv:min]basepython} +deps = + {[testenv:min]deps} + {[testenv:min-hbase]deps} + {[testenv:min-kafka]deps} + {[testenv:min-logging]deps} + {[testenv:min-s3]deps} + {[testenv:min-scrapy]deps} + {[testenv:min-sql]deps} + {[testenv:min-zeromq]deps} + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all # Options for pytest [pytest] diff --git a/versioneer.py b/versioneer.py deleted file mode 100644 index 889601c94..000000000 --- a/versioneer.py +++ /dev/null @@ -1,950 +0,0 @@ - -# Version: 0.12 - -""" -The Versioneer -============== - -* like a rocketeer, but for versions! -* https://github.com/warner/python-versioneer -* Brian Warner (modified by Florian Wilhelm and Felix Wick) -* License: Public Domain -* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, and pypy - -[![Build Status](https://travis-ci.org/warner/python-versioneer.png?branch=master)](https://travis-ci.org/warner/python-versioneer) - -This is a tool for managing a recorded version number in distutils-based -python projects. The goal is to remove the tedious and error-prone "update -the embedded version string" step from your release process. Making a new -release should be as easy as recording a new tag in your version-control -system, and maybe making new tarballs. - - -## Quick Install - -* `pip install versioneer` to somewhere to your $PATH -* run `versioneer-installer` in your source tree: this installs `versioneer.py` -* follow the instructions below (also in the `versioneer.py` docstring) - -## Version Identifiers - -Source trees come from a variety of places: - -* a version-control system checkout (mostly used by developers) -* a nightly tarball, produced by build automation -* a snapshot tarball, produced by a web-based VCS browser, like github's - "tarball from tag" feature -* a release tarball, produced by "setup.py sdist", distributed through PyPI - -Within each source tree, the version identifier (either a string or a number, -this tool is format-agnostic) can come from a variety of places: - -* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows - about recent "tags" and an absolute revision-id -* the name of the directory into which the tarball was unpacked -* an expanded VCS keyword ($Id$, etc) -* a `_version.py` created by some earlier build step - -For released software, the version identifier is closely related to a VCS -tag. Some projects use tag names that include more than just the version -string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool -needs to strip the tag prefix to extract the version identifier. For -unreleased software (between tags), the version identifier should provide -enough information to help developers recreate the same tree, while also -giving them an idea of roughly how old the tree is (after version 1.2, before -version 1.3). Many VCS systems can report a description that captures this, -for example 'git describe --tags --dirty --always' reports things like -"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the -0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes. 
- -The version identifier is used for multiple purposes: - -* to allow the module to self-identify its version: `myproject.__version__` -* to choose a name and prefix for a 'setup.py sdist' tarball - -## Theory of Operation - -Versioneer works by adding a special `_version.py` file into your source -tree, where your `__init__.py` can import it. This `_version.py` knows how to -dynamically ask the VCS tool for version information at import time. However, -when you use "setup.py build" or "setup.py sdist", `_version.py` in the new -copy is replaced by a small static file that contains just the generated -version data. - -`_version.py` also contains `$Revision$` markers, and the installation -process marks `_version.py` to have this marker rewritten with a tag name -during the "git archive" command. As a result, generated tarballs will -contain enough information to get the proper version. - - -## Installation - -First, decide on values for the following configuration variables: - -* `VCS`: the version control system you use. Currently accepts "git". - -* `versionfile_source`: - - A project-relative pathname into which the generated version strings should - be written. This is usually a `_version.py` next to your project's main - `__init__.py` file, so it can be imported at runtime. If your project uses - `src/myproject/__init__.py`, this should be `src/myproject/_version.py`. - This file should be checked in to your VCS as usual: the copy created below - by `setup.py versioneer` will include code that parses expanded VCS - keywords in generated tarballs. The 'build' and 'sdist' commands will - replace it with a copy that has just the calculated version string. - - This must be set even if your project does not have any modules (and will - therefore never import `_version.py`), since "setup.py sdist" -based trees - still need somewhere to record the pre-calculated version strings. Anywhere - in the source tree should do. If there is a `__init__.py` next to your - `_version.py`, the `setup.py versioneer` command (described below) will - append some `__version__`-setting assignments, if they aren't already - present. - -* `versionfile_build`: - - Like `versionfile_source`, but relative to the build directory instead of - the source directory. These will differ when your setup.py uses - 'package_dir='. If you have `package_dir={'myproject': 'src/myproject'}`, - then you will probably have `versionfile_build='myproject/_version.py'` and - `versionfile_source='src/myproject/_version.py'`. - - If this is set to None, then `setup.py build` will not attempt to rewrite - any `_version.py` in the built tree. If your project does not have any - libraries (e.g. if it only builds a script), then you should use - `versionfile_build = None` and override `distutils.command.build_scripts` - to explicitly insert a copy of `versioneer.get_version()` into your - generated script. - -* `tag_prefix`: - - a string, like 'PROJECTNAME-', which appears at the start of all VCS tags. - If your tags look like 'myproject-1.2.0', then you should use - tag_prefix='myproject-'. If you use unprefixed tags like '1.2.0', this - should be an empty string. - -* `parentdir_prefix`: - - a string, frequently the same as tag_prefix, which appears at the start of - all unpacked tarball filenames. If your tarball unpacks into - 'myproject-1.2.0', this should be 'myproject-'. - -This tool provides one script, named `versioneer-installer`. That script does -one thing: write a copy of `versioneer.py` into the current directory. 
- -To versioneer-enable your project: - -* 1: Run `versioneer-installer` to copy `versioneer.py` into the top of your - source tree. - -* 2: add the following lines to the top of your `setup.py`, with the - configuration values you decided earlier: - - import versioneer - versioneer.VCS = 'git' - versioneer.versionfile_source = 'src/myproject/_version.py' - versioneer.versionfile_build = 'myproject/_version.py' - versioneer.tag_prefix = '' # tags are like 1.2.0 - versioneer.parentdir_prefix = 'myproject-' # dirname like 'myproject-1.2.0' - -* 3: add the following arguments to the setup() call in your setup.py: - - version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), - -* 4: now run `setup.py versioneer`, which will create `_version.py`, and will - modify your `__init__.py` (if one exists next to `_version.py`) to define - `__version__` (by calling a function from `_version.py`). It will also - modify your `MANIFEST.in` to include both `versioneer.py` and the generated - `_version.py` in sdist tarballs. - -* 5: commit these changes to your VCS. To make sure you won't forget, - `setup.py versioneer` will mark everything it touched for addition. - -## Post-Installation Usage - -Once established, all uses of your tree from a VCS checkout should get the -current version string. All generated tarballs should include an embedded -version string (so users who unpack them will not need a VCS tool installed). - -If you distribute your project through PyPI, then the release process should -boil down to two steps: - -* 1: git tag 1.0 -* 2: python setup.py register sdist upload - -If you distribute it through github (i.e. users use github to generate -tarballs with `git archive`), the process is: - -* 1: git tag 1.0 -* 2: git push; git push --tags - -Currently, all version strings must be based upon a tag. Versioneer will -report "unknown" until your tree has at least one tag in its history. This -restriction will be fixed eventually (see issue #12). - -## Version-String Flavors - -Code which uses Versioneer can learn about its version string at runtime by -importing `_version` from your main `__init__.py` file and running the -`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can -import the top-level `versioneer.py` and run `get_versions()`. - -Both functions return a dictionary with different keys for different flavors -of the version string: - -* `['version']`: condensed tag+distance+shortid+dirty identifier. For git, - this uses the output of `git describe --tags --dirty --always` but strips - the tag_prefix. For example "0.11-2-g1076c97-dirty" indicates that the tree - is like the "1076c97" commit but has uncommitted changes ("-dirty"), and - that this commit is two revisions ("-2-") beyond the "0.11" tag. For - released software (exactly equal to a known tag), the identifier will only - contain the stripped tag, e.g. "0.11". - -* `['full']`: detailed revision identifier. For Git, this is the full SHA1 - commit id, followed by "-dirty" if the tree contains uncommitted changes, - e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac-dirty". - -Some variants are more useful than others. Including `full` in a bug report -should allow developers to reconstruct the exact code being tested (or -indicate the presence of local changes that should be shared with the -developers). `version` is suitable for display in an "about" box or a CLI -`--version` output: it can be easily compared against release notes and lists -of bugs fixed in various releases. 
- -In the future, this will also include a -[PEP-0440](http://legacy.python.org/dev/peps/pep-0440/) -compatible flavor -(e.g. `1.2.post0.dev123`). This loses a lot of information (and has no room -for a hash-based revision id), but is safe to use in a `setup.py` -"`version=`" argument. It also enables tools like *pip* to compare version -strings and evaluate compatibility constraint declarations. - -The `setup.py versioneer` command adds the following text to your -`__init__.py` to place a basic version in `YOURPROJECT.__version__`: - - from ._version import get_versions - __version__ = get_versions()['version'] - del get_versions - -## Updating Versioneer - -To upgrade your project to a new release of Versioneer, do the following: - -* install the new Versioneer (`pip install -U versioneer` or equivalent) -* re-run `versioneer-installer` in your source tree to replace your copy of - `versioneer.py` -* edit `setup.py`, if necessary, to include any new configuration settings - indicated by the release notes -* re-run `setup.py versioneer` to replace `SRC/_version.py` -* commit any changed files - -### Upgrading from 0.10 to 0.11 - -You must add a `versioneer.VCS = "git"` to your `setup.py` before re-running -`setup.py versioneer`. This will enable the use of additional version-control -systems (SVN, etc) in the future. - -### Upgrading from 0.11 to 0.12 - -Nothing special. - -## Future Directions - -This tool is designed to make it easily extended to other version-control -systems: all VCS-specific components are in separate directories like -src/git/ . The top-level `versioneer.py` script is assembled from these -components by running make-versioneer.py . In the future, make-versioneer.py -will take a VCS name as an argument, and will construct a version of -`versioneer.py` that is specific to the given VCS. It might also take the -configuration arguments that are currently provided manually during -installation by editing setup.py . Alternatively, it might go the other -direction and include code from all supported VCS systems, reducing the -number of intermediate scripts. - - -## License - -To make Versioneer easier to embed, all its code is hereby released into the -public domain. The `_version.py` that it creates is also in the public -domain. 
- -""" - -import os, sys, re, subprocess, errno -from distutils.core import Command -from distutils.command.sdist import sdist as _sdist -from distutils.command.build import build as _build - -# these configuration settings will be overridden by setup.py after it -# imports us -versionfile_source = None -versionfile_build = None -tag_prefix = None -parentdir_prefix = None -VCS = None - -# these dictionaries contain VCS-specific tools -LONG_VERSION_PY = {} - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): - assert isinstance(commands, list) - p = None - for c in commands: - try: - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % args[0]) - print(e) - return None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None - stdout = p.communicate()[0].strip() - if sys.version >= '3': - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % args[0]) - return None - return stdout - -LONG_VERSION_PY['git'] = ''' -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.12 (https://github.com/warner/python-versioneer) - -# these strings will be replaced by git during git-archive -git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" -git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" - -# these strings are filled in when 'setup.py versioneer' creates _version.py -tag_prefix = "%(TAG_PREFIX)s" -parentdir_prefix = "%(PARENTDIR_PREFIX)s" -versionfile_source = "%(VERSIONFILE_SOURCE)s" - -import os, sys, re, subprocess, errno - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): - assert isinstance(commands, list) - p = None - for c in commands: - try: - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %%s" %% args[0]) - print(e) - return None - else: - if verbose: - print("unable to find command, tried %%s" %% (commands,)) - return None - stdout = p.communicate()[0].strip() - if sys.version >= '3': - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %%s (error)" %% args[0]) - return None - return stdout - - -def versions_from_parentdir(parentdir_prefix, root, verbose=False): - # Source tarballs conventionally unpack into a directory that includes - # both the project name and a version string. 
- dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print("guessing rootdir is '%%s', but '%%s' doesn't start with prefix '%%s'" %% - (root, dirname, parentdir_prefix)) - return None - return {"version": dirname[len(parentdir_prefix):], "full": ""} - -def git_get_keywords(versionfile_abs): - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - f = open(versionfile_abs,"r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - -def git_versions_from_keywords(keywords, tag_prefix, verbose=False): - if not keywords: - return {} # keyword-finding function failed to find keywords - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - return {} # unexpanded, so not in an unpacked git-archive tarball - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %%d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%%s', no digits" %% ",".join(refs-tags)) - if verbose: - print("likely tags: %%s" %% ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %%s" %% r) - return { "version": r, - "full": keywords["full"].strip() } - # no suitable tags, so we use the full revision id - if verbose: - print("no suitable tags, using full revision id") - return { "version": keywords["full"].strip(), - "full": keywords["full"].strip() } - - -def git_versions_from_vcs(tag_prefix, root, verbose=False): - # this runs 'git' from the root of the source tree. This only gets called - # if the git-archive 'subst' keywords were *not* expanded, and - # _version.py hasn't already been rewritten with a short version string, - # meaning we're inside a checked out source tree. 
- - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print("no .git in %%s" %% root) - return {} - - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - stdout = run_command(GITS, ["describe", "--tags", "--dirty", "--always"], - cwd=root) - if stdout is None: - return {} - if not stdout.startswith(tag_prefix): - if verbose: - print("tag '%%s' doesn't start with prefix '%%s'" %% (stdout, tag_prefix)) - return {} - tag = stdout[len(tag_prefix):] - stdout = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if stdout is None: - return {} - full = stdout.strip() - if tag.endswith("-dirty"): - full += "-dirty" - return {"version": tag, "full": full} - - -def get_versions(default={"version": "unknown", "full": ""}, verbose=False): - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - keywords = { "refnames": git_refnames, "full": git_full } - ver = git_versions_from_keywords(keywords, tag_prefix, verbose) - if ver: - return rep_by_pep440(ver) - - try: - root = os.path.abspath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for i in range(len(versionfile_source.split('/'))): - root = os.path.dirname(root) - except NameError: - return default - - return rep_by_pep440( - git_versions_from_vcs(tag_prefix, root, verbose) - or versions_from_parentdir(parentdir_prefix, root, verbose) - or default) - - -def git2pep440(ver_str): - dash_count = ver_str.count('-') - if dash_count == 0: - return ver_str - elif dash_count == 1: - return ver_str.split('-')[0] + ".post.dev1.pre" - elif dash_count == 2: - tag, commits, _ = ver_str.split('-') - return ".post.dev".join([tag, commits]) - elif dash_count == 3: - tag, commits, _, _ = ver_str.split('-') - commits = str(int(commits) + 1) - return ".post.dev".join([tag, commits]) + ".pre" - else: - raise RuntimeError("Invalid version string") - - -def rep_by_pep440(ver): - if ver["full"]: # only if versions_from_parentdir was not used - ver["version"] = git2pep440(ver["version"]) - else: - ver["version"] = ver["version"].split('-')[0] - return ver -''' - -def git_get_keywords(versionfile_abs): - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - f = open(versionfile_abs,"r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - -def git_versions_from_keywords(keywords, tag_prefix, verbose=False): - if not keywords: - return {} # keyword-finding function failed to find keywords - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - return {} # unexpanded, so not in an unpacked git-archive tarball - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return { "version": r, - "full": keywords["full"].strip() } - # no suitable tags, so we use the full revision id - if verbose: - print("no suitable tags, using full revision id") - return { "version": keywords["full"].strip(), - "full": keywords["full"].strip() } - - -def git_versions_from_vcs(tag_prefix, root, verbose=False): - # this runs 'git' from the root of the source tree. This only gets called - # if the git-archive 'subst' keywords were *not* expanded, and - # _version.py hasn't already been rewritten with a short version string, - # meaning we're inside a checked out source tree. 
- - if not os.path.exists(os.path.join(root, ".git")): - if verbose: - print("no .git in %s" % root) - return {} - - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - stdout = run_command(GITS, ["describe", "--tags", "--dirty", "--always"], - cwd=root) - if stdout is None: - return {} - if not stdout.startswith(tag_prefix): - if verbose: - print("tag '%s' doesn't start with prefix '%s'" % (stdout, tag_prefix)) - return {} - tag = stdout[len(tag_prefix):] - stdout = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if stdout is None: - return {} - full = stdout.strip() - if tag.endswith("-dirty"): - full += "-dirty" - return {"version": tag, "full": full} - - -def do_vcs_install(manifest_in, versionfile_source, ipy): - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - files = [manifest_in, versionfile_source] - if ipy: - files.append(ipy) - try: - me = __file__ - if me.endswith(".pyc") or me.endswith(".pyo"): - me = os.path.splitext(me)[0] + ".py" - versioneer_file = os.path.relpath(me) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) - present = False - try: - f = open(".gitattributes", "r") - for line in f.readlines(): - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - f.close() - except EnvironmentError: - pass - if not present: - f = open(".gitattributes", "a+") - f.write("%s export-subst\n" % versionfile_source) - f.close() - files.append(".gitattributes") - run_command(GITS, ["add", "--"] + files) - -def versions_from_parentdir(parentdir_prefix, root, verbose=False): - # Source tarballs conventionally unpack into a directory that includes - # both the project name and a version string. - dirname = os.path.basename(root) - if not dirname.startswith(parentdir_prefix): - if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" % - (root, dirname, parentdir_prefix)) - return None - return {"version": dirname[len(parentdir_prefix):], "full": ""} - -SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.12) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. 
- -version_version = '%(version)s' -version_full = '%(full)s' -def get_versions(default={}, verbose=False): - return {'version': version_version, 'full': version_full} - -""" - -DEFAULT = {"version": "unknown", "full": "unknown"} - -def versions_from_file(filename): - versions = {} - try: - with open(filename) as f: - for line in f.readlines(): - mo = re.match("version_version = '([^']+)'", line) - if mo: - versions["version"] = mo.group(1) - mo = re.match("version_full = '([^']+)'", line) - if mo: - versions["full"] = mo.group(1) - except EnvironmentError: - return {} - - return versions - -def write_to_version_file(filename, versions): - with open(filename, "w") as f: - f.write(SHORT_VERSION_PY % versions) - - print("set %s to '%s'" % (filename, versions["version"])) - - -def get_root(): - try: - return os.path.dirname(os.path.abspath(__file__)) - except NameError: - return os.path.dirname(os.path.abspath(sys.argv[0])) - -def vcs_function(vcs, suffix): - return getattr(sys.modules[__name__], '%s_%s' % (vcs, suffix), None) - -def get_versions(default=DEFAULT, verbose=False): - # returns dict with two keys: 'version' and 'full' - assert versionfile_source is not None, "please set versioneer.versionfile_source" - assert tag_prefix is not None, "please set versioneer.tag_prefix" - assert parentdir_prefix is not None, "please set versioneer.parentdir_prefix" - assert VCS is not None, "please set versioneer.VCS" - - # I am in versioneer.py, which must live at the top of the source tree, - # which we use to compute the root directory. py2exe/bbfreeze/non-CPython - # don't have __file__, in which case we fall back to sys.argv[0] (which - # ought to be the setup.py script). We prefer __file__ since that's more - # robust in cases where setup.py was invoked in some weird way (e.g. pip) - root = get_root() - versionfile_abs = os.path.join(root, versionfile_source) - - # extract version from first of _version.py, VCS command (e.g. 'git - # describe'), parentdir. This is meant to work for developers using a - # source checkout, for users of a tarball created by 'setup.py sdist', - # and for users of a tarball/zipball created by 'git archive' or github's - # download-from-tag feature or the equivalent in other VCSes. 
- - get_keywords_f = vcs_function(VCS, "get_keywords") - versions_from_keywords_f = vcs_function(VCS, "versions_from_keywords") - if get_keywords_f and versions_from_keywords_f: - vcs_keywords = get_keywords_f(versionfile_abs) - ver = versions_from_keywords_f(vcs_keywords, tag_prefix) - if ver: - if verbose: print("got version from expanded keyword %s" % ver) - return rep_by_pep440(ver) - - ver = versions_from_file(versionfile_abs) - if ver: - if verbose: print("got version from file %s %s" % (versionfile_abs,ver)) - return rep_by_pep440(ver) - - versions_from_vcs_f = vcs_function(VCS, "versions_from_vcs") - if versions_from_vcs_f: - ver = versions_from_vcs_f(tag_prefix, root, verbose) - if ver: - if verbose: print("got version from VCS %s" % ver) - return rep_by_pep440(ver) - - ver = versions_from_parentdir(parentdir_prefix, root, verbose) - if ver: - if verbose: print("got version from parentdir %s" % ver) - return rep_by_pep440(ver) - - if verbose: print("got version from default %s" % default) - return rep_by_pep440(default) - -def get_version(verbose=False): - return get_versions(verbose=verbose)["version"] - -class cmd_version(Command): - description = "report generated version string" - user_options = [] - boolean_options = [] - def initialize_options(self): - pass - def finalize_options(self): - pass - def run(self): - ver = get_version(verbose=True) - print("Version is currently: %s" % ver) - - -class cmd_build(_build): - def run(self): - versions = get_versions(verbose=True) - _build.run(self) - # now locate _version.py in the new build/ directory and replace it - # with an updated value - if versionfile_build: - target_versionfile = os.path.join(self.build_lib, versionfile_build) - print("UPDATING %s" % target_versionfile) - os.unlink(target_versionfile) - with open(target_versionfile, "w") as f: - f.write(SHORT_VERSION_PY % versions) - -if 'cx_Freeze' in sys.modules: # cx_freeze enabled? 
- from cx_Freeze.dist import build_exe as _build_exe - - class cmd_build_exe(_build_exe): - def run(self): - versions = get_versions(verbose=True) - target_versionfile = versionfile_source - print("UPDATING %s" % target_versionfile) - os.unlink(target_versionfile) - with open(target_versionfile, "w") as f: - f.write(SHORT_VERSION_PY % versions) - - _build_exe.run(self) - os.unlink(target_versionfile) - with open(versionfile_source, "w") as f: - assert VCS is not None, "please set versioneer.VCS" - LONG = LONG_VERSION_PY[VCS] - f.write(LONG % {"DOLLAR": "$", - "TAG_PREFIX": tag_prefix, - "PARENTDIR_PREFIX": parentdir_prefix, - "VERSIONFILE_SOURCE": versionfile_source, - }) - -class cmd_sdist(_sdist): - def run(self): - versions = get_versions(verbose=True) - self._versioneer_generated_versions = versions - # unless we update this, the command will keep using the old version - self.distribution.metadata.version = versions["version"] - return _sdist.run(self) - - def make_release_tree(self, base_dir, files): - _sdist.make_release_tree(self, base_dir, files) - # now locate _version.py in the new base_dir directory (remembering - # that it may be a hardlink) and replace it with an updated value - target_versionfile = os.path.join(base_dir, versionfile_source) - print("UPDATING %s" % target_versionfile) - os.unlink(target_versionfile) - with open(target_versionfile, "w") as f: - f.write(SHORT_VERSION_PY % self._versioneer_generated_versions) - -INIT_PY_SNIPPET = """ -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions -""" - -class cmd_update_files(Command): - description = "install/upgrade Versioneer files: __init__.py SRC/_version.py" - user_options = [] - boolean_options = [] - def initialize_options(self): - pass - def finalize_options(self): - pass - def run(self): - print(" creating %s" % versionfile_source) - with open(versionfile_source, "w") as f: - assert VCS is not None, "please set versioneer.VCS" - LONG = LONG_VERSION_PY[VCS] - f.write(LONG % {"DOLLAR": "$", - "TAG_PREFIX": tag_prefix, - "PARENTDIR_PREFIX": parentdir_prefix, - "VERSIONFILE_SOURCE": versionfile_source, - }) - - ipy = os.path.join(os.path.dirname(versionfile_source), "__init__.py") - if os.path.exists(ipy): - try: - with open(ipy, "r") as f: - old = f.read() - except EnvironmentError: - old = "" - if INIT_PY_SNIPPET not in old: - print(" appending to %s" % ipy) - with open(ipy, "a") as f: - f.write(INIT_PY_SNIPPET) - else: - print(" %s unmodified" % ipy) - else: - print(" %s doesn't exist, ok" % ipy) - ipy = None - - # Make sure both the top-level "versioneer.py" and versionfile_source - # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to - # install the package without this. - manifest_in = os.path.join(get_root(), "MANIFEST.in") - simple_includes = set() - try: - with open(manifest_in, "r") as f: - for line in f: - if line.startswith("include "): - for include in line.split()[1:]: - simple_includes.add(include) - except EnvironmentError: - pass - # That doesn't cover everything MANIFEST.in can do - # (http://docs.python.org/2/distutils/sourcedist.html#commands), so - # it might give some false negatives. Appending redundant 'include' - # lines is safe, though. 
- if "versioneer.py" not in simple_includes: - print(" appending 'versioneer.py' to MANIFEST.in") - with open(manifest_in, "a") as f: - f.write("include versioneer.py\n") - else: - print(" 'versioneer.py' already in MANIFEST.in") - if versionfile_source not in simple_includes: - print(" appending versionfile_source ('%s') to MANIFEST.in" % - versionfile_source) - with open(manifest_in, "a") as f: - f.write("include %s\n" % versionfile_source) - else: - print(" versionfile_source already in MANIFEST.in") - - # Make VCS-specific changes. For git, this means creating/changing - # .gitattributes to mark _version.py for export-time keyword - # substitution. - do_vcs_install(manifest_in, versionfile_source, ipy) - -def get_cmdclass(): - cmds = {'version': cmd_version, - 'versioneer': cmd_update_files, - 'build': cmd_build, - 'sdist': cmd_sdist, - } - if 'cx_Freeze' in sys.modules: # cx_freeze enabled? - cmds['build_exe'] = cmd_build_exe - del cmds['build'] - - return cmds - -def git2pep440(ver_str): - dash_count = ver_str.count('-') - if dash_count == 0: - return ver_str - elif dash_count == 1: - return ver_str.split('-')[0] + ".post.dev1.pre" - elif dash_count == 2: - tag, commits, _ = ver_str.split('-') - return ".post.dev".join([tag, commits]) - elif dash_count == 3: - tag, commits, _, _ = ver_str.split('-') - commits = str(int(commits) + 1) - return ".post.dev".join([tag, commits]) + ".pre" - else: - raise RuntimeError("Invalid version string") - -def rep_by_pep440(ver): - if ver["full"]: # only if versions_from_parentdir was not used - ver["version"] = git2pep440(ver["version"]) - else: - ver["version"] = ver["version"].split('-')[0] - return ver