Skip to content

Add script to check for broken deep links #315

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 24, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
build/
.*~
.nox
*.pyc
__pycache__
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
language: python
sudo: false
python:
- 2.7
install: pip install sphinx==1.5.6
script: make SPHINXOPTS="-W" html
- 3.6
install: pip install --upgrade nox-automation virtualenv
script: nox -s build checklinks
36 changes: 36 additions & 0 deletions nox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright 2017, PyPA
# The Python Packaging User Guide is licensed under a Creative Commons
# Attribution-ShareAlike license:
# http://creativecommons.org/licenses/by-sa/3.0.

import os

import nox


@nox.session
def build(session):
    """Build the HTML documentation with warnings treated as errors."""
    session.interpreter = 'python3.6'
    # Install the packages pinned in the top-level requirements file.
    session.install('-r', 'requirements.txt')
    # Treat warnings as errors.
    session.env['SPHINXOPTS'] = '-W'
    # Start from a clean tree so stale output cannot mask a failure.
    session.run('make', 'clean', 'html')


def linkmonitor(session, command):
    """Run the link monitor script with the given sub-command.

    ``command`` is passed through unchanged as the script's only argument.
    The session is aborted via ``session.error`` when ``build/html``
    (relative to the current working directory) does not exist.
    """
    # The check needs previously built HTML output; fail early without it.
    if not os.path.exists(os.path.join('build', 'html')):
        session.error('HTML output not available, run nox -s build first.')
    session.interpreter = 'python3.6'
    # The script has its own, separate requirements file.
    session.install('-r', 'scripts/linkmonitor/requirements.txt')
    session.run(
        'python', 'scripts/linkmonitor/linkmonitor.py', command)


@nox.session
def checklinks(session):
    """Run the link monitor script with the ``check`` sub-command."""
    linkmonitor(session, 'check')


@nox.session
def updatelinks(session):
    """Run the link monitor script with the ``update`` sub-command."""
    linkmonitor(session, 'update')
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sphinx==1.5.6
97 changes: 97 additions & 0 deletions scripts/linkmonitor/inventory.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
[additional.html, additional.html#additional-topics, appveyor.html, appveyor.html#access-to-the-built-wheels,
appveyor.html#adding-appveyor-support-to-your-project, appveyor.html#additional-notes,
appveyor.html#appveyor-yml, appveyor.html#automatically-uploading-wheels, appveyor.html#background,
appveyor.html#contents, appveyor.html#external-dependencies, appveyor.html#setting-up,
appveyor.html#support-script, appveyor.html#support-scripts, appveyor.html#supporting-windows-using-appveyor,
appveyor.html#testing-with-tox, contribute.html, contribute.html#audience, contribute.html#contribute-to-this-guide,
contribute.html#contributing-style-guide, contribute.html#conventions-and-mechanics,
contribute.html#purpose, contribute.html#scope, contribute.html#style-guide, contribute.html#voice-and-tone,
current.html, current.html#installation-tool-recommendations, current.html#packaging-tool-recommendations,
current.html#tool-recommendations, deployment.html, deployment.html#application-bundles,
deployment.html#application-deployment, deployment.html#configuration-management,
deployment.html#contents, deployment.html#os-packaging-installers, deployment.html#overview,
deployment.html#pynsist, deployment.html#supporting-multiple-hardware-platforms,
deployment.html#windows, distributing.html, distributing.html#author, distributing.html#choosing-a-versioning-scheme,
distributing.html#classifiers, distributing.html#configuring-your-project, distributing.html#console-scripts,
distributing.html#contents, distributing.html#create-an-account, distributing.html#data-files,
distributing.html#date-based-versioning, distributing.html#description, distributing.html#entry-points,
distributing.html#hybrid-schemes, distributing.html#initial-files, distributing.html#install-requires,
distributing.html#keywords, distributing.html#license, distributing.html#local-version-identifiers,
distributing.html#manifest-in, distributing.html#name, distributing.html#package-data,
distributing.html#packages, distributing.html#packaging-and-distributing-projects,
distributing.html#packaging-your-project, distributing.html#platform-wheels, distributing.html#pre-release-versioning,
distributing.html#pure-python-wheels, distributing.html#readme-rst, distributing.html#requirements-for-packaging-and-distributing,
distributing.html#scheme-choices, distributing.html#scripts, distributing.html#semantic-versioning-preferred,
distributing.html#serial-versioning, distributing.html#setup-args, distributing.html#setup-cfg,
distributing.html#setup-name, distributing.html#setup-py, distributing.html#source-distributions,
distributing.html#standards-compliance-for-interoperability, distributing.html#universal-wheels,
distributing.html#upload-your-distributions, distributing.html#uploading-your-project-to-pypi,
distributing.html#url, distributing.html#version, distributing.html#wheels, distributing.html#working-in-development-mode,
distributing.html#your-package, extensions.html, extensions.html#alternatives-for-low-level-system-access,
extensions.html#alternatives-to-handcoded-accelerator-modules, extensions.html#alternatives-to-handcoded-wrapper-modules,
extensions.html#an-overview-of-binary-extensions, extensions.html#binary-extensions,
extensions.html#building-binary-extensions, extensions.html#contents, extensions.html#disadvantages,
extensions.html#implementing-binary-extensions, extensions.html#publishing-binary-extensions,
extensions.html#setting-up-a-build-environment-on-windows, extensions.html#use-cases,
glossary.html, glossary.html#glossary, glossary.html#term-binary-distribution, glossary.html#term-built-distribution,
glossary.html#term-distribution-package, glossary.html#term-egg, glossary.html#term-extension-module,
glossary.html#term-import-package, glossary.html#term-known-good-set-kgs, glossary.html#term-module,
glossary.html#term-package-index, glossary.html#term-per-project-index, glossary.html#term-project,
glossary.html#term-pure-module, glossary.html#term-python-package-index-pypi, glossary.html#term-python-packaging-authority-pypa,
glossary.html#term-release, glossary.html#term-requirement, glossary.html#term-requirement-specifier,
glossary.html#term-requirements-file, glossary.html#term-setup-py, glossary.html#term-source-archive,
glossary.html#term-source-distribution-or-sdist, glossary.html#term-system-package,
glossary.html#term-version-specifier, glossary.html#term-virtual-environment, glossary.html#term-wheel,
glossary.html#term-working-set, index.html, index.html#python-packaging-user-guide,
install_requirements_linux.html, install_requirements_linux.html#arch-linux, install_requirements_linux.html#centos-rhel,
install_requirements_linux.html#debian-ubuntu, install_requirements_linux.html#fedora,
install_requirements_linux.html#installing-pip-setuptools-wheel-with-linux-package-managers,
install_requirements_linux.html#opensuse, installing.html, installing.html#contents,
installing.html#creating-and-using-virtual-environments, installing.html#creating-virtual-environments,
installing.html#install-pip-setuptools-and-wheel, installing.html#installing-from-a-local-src-tree,
installing.html#installing-from-local-archives, installing.html#installing-from-other-indexes,
installing.html#installing-from-other-sources, installing.html#installing-from-pypi,
installing.html#installing-from-vcs, installing.html#installing-packages, installing.html#installing-prereleases,
installing.html#installing-requirements, installing.html#installing-setuptools-extras,
installing.html#installing-to-the-user-site, installing.html#optionally-create-a-virtual-environment,
installing.html#requirements-files, installing.html#requirements-for-installing-packages,
installing.html#source-distributions-vs-wheels, installing.html#upgrading-packages,
installing.html#use-pip-for-installing, key_projects.html, key_projects.html#bandersnatch,
key_projects.html#bento, key_projects.html#buildout, key_projects.html#conda, key_projects.html#devpi,
key_projects.html#distlib, key_projects.html#distutils, key_projects.html#easy-install,
key_projects.html#ensurepip, key_projects.html#hashdist, key_projects.html#non-pypa-projects,
key_projects.html#packaging, key_projects.html#pex, key_projects.html#pip, key_projects.html#project-summaries,
key_projects.html#projects, key_projects.html#pypa-projects, key_projects.html#python-packaging-user-guide,
key_projects.html#setuptools, key_projects.html#spack, key_projects.html#standard-library-projects,
key_projects.html#twine, key_projects.html#venv, key_projects.html#virtualenv, key_projects.html#warehouse,
key_projects.html#wheel, mirrors.html, mirrors.html#caching-with-devpi, mirrors.html#caching-with-pip,
mirrors.html#complete-mirror-with-bandersnatch, mirrors.html#contents, mirrors.html#pypi-mirrors-and-caches,
multi_version_install.html, multi_version_install.html#multi-version-installs, multiple_python_versions.html,
multiple_python_versions.html#automated-testing-and-continuous-integration, multiple_python_versions.html#contents,
multiple_python_versions.html#supporting-multiple-python-versions, multiple_python_versions.html#tools-for-single-source-python-packages,
multiple_python_versions.html#what-s-in-which-python, namespace_packages.html, namespace_packages.html#creating-a-namespace-package,
namespace_packages.html#native-namespace-packages, namespace_packages.html#packaging-namespace-packages,
namespace_packages.html#pkg-resources-style-namespace-packages, namespace_packages.html#pkgutil-style-namespace-packages,
patching.html, patching.html#patching-forking, pip_easy_install.html, pip_easy_install.html#pip-vs-easy-install,
platforms.html, platforms.html#platform-integtation, plugin_discovery.html, plugin_discovery.html#plugin-creation-and-discovery,
plugin_discovery.html#using-namespace-packages, plugin_discovery.html#using-naming-convention,
plugin_discovery.html#using-package-metadata, quickstart.html, quickstart.html#quickstart,
requirements.html, requirements.html#contents, requirements.html#install-requires,
requirements.html#install-requires-vs-requirements-files, requirements.html#requirements-files,
science.html, science.html#building-from-source, science.html#contents, science.html#installing-scientific-packages,
science.html#linux-distribution-packages, science.html#mac-os-x-installers-and-package-managers,
science.html#numpy-and-the-science-stack, science.html#scipy-distributions, science.html#spack,
science.html#the-conda-cross-platform-package-manager, science.html#windows-installers,
search.html, search.html#fallback, search.html#search-documentation, search.html#search-progress,
search.html#search-results, search.html#searchindexloader, self_hosted_repository.html,
self_hosted_repository.html#hosting-your-own-simple-repository, self_hosted_repository.html#manual-repository,
single_source_version.html, single_source_version.html#single-sourcing-the-project-version,
single_source_version.html#single-sourcing-the-version, specifications.html, specifications.html#binary-distribution-format,
specifications.html#core-metadata, specifications.html#declaring-build-system-dependencies,
specifications.html#dependency-specifiers, specifications.html#description-content-type,
specifications.html#package-distribution-metadata, specifications.html#package-index-interfaces,
specifications.html#platform-compatibility-tags, specifications.html#provides-extra-multiple-use,
specifications.html#pypa-specifications, specifications.html#recording-installed-distributions,
specifications.html#simple-repository-api, specifications.html#source-distribution-format,
specifications.html#specifications, specifications.html#version-specifiers, support.html,
support.html#how-to-get-support, tutorial.html, wheel_egg.html, wheel_egg.html#wheel-vs-egg]
190 changes: 190 additions & 0 deletions scripts/linkmonitor/linkmonitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright 2017, PyPA
# The Python Packaging User Guide is licensed under a Creative Commons
# Attribution-ShareAlike license:
# http://creativecommons.org/licenses/by-sa/3.0.

import argparse
from glob import glob
import io
import os
import sys

from bs4 import BeautifulSoup
import yaml

# Directory containing this script; the YAML data files live next to it.
HERE = os.path.abspath(os.path.dirname(__file__))
# Recorded inventory of known pages and deep links.
INVENTORY_FILENAME = os.path.join(HERE, 'inventory.yaml')
# Declared page redirects (a list of ``from``/``to`` mappings).
REDIRECTS_FILENAME = os.path.join(HERE, 'redirects.yaml')
# Two directory levels above this script.
ROOT = os.path.abspath(os.path.join(HERE, '..', '..'))
# Location of the built HTML that gets scanned for anchors.
HTML_DIR = os.path.join(ROOT, 'build', 'html')
# File names excluded from link discovery.
IGNORED_FILES = [
    'genindex.html'
]


def find_all_named_anchors(filename):
    """Return the set of deep links found in one HTML file.

    Every element carrying an ``id`` attribute contributes one link of the
    form ``<filename>#<id>``, except for ids that are filtered out below.

    Args:
        filename: Path of the HTML file to parse. It is used verbatim as the
            page part of each returned link.

    Returns:
        A set of ``'<filename>#<anchor>'`` strings.
    """
    links = set()

    # Decode explicitly as UTF-8 instead of relying on the locale's preferred
    # encoding, which differs between platforms and can fail on non-ASCII
    # pages. NOTE(review): assumes the built HTML is UTF-8 encoded.
    with io.open(filename, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    for tag in soup.find_all(id=True):
        anchor = tag['id']
        # Ignore non-named IDs ('id...'), index anchors ('index-...') and the
        # searchbox anchor; none of them are tracked as deep links.
        if anchor.startswith(('id', 'index-')) or anchor == 'searchbox':
            continue

        links.add('{}#{}'.format(filename, anchor))

    return links


def find_all_named_anchors_in_files(files):
    """Collect every page and every named anchor for the given HTML files.

    Each file name is included as a link in its own right, followed by all
    of the deep links discovered inside that file.
    """
    collected = set()

    for page in files:
        # The bare page counts as a link too, not only its anchors.
        collected.add(page)
        collected.update(find_all_named_anchors(page))

    return collected


def find_links():
    """Return all pages and named anchors below the current directory.

    HTML files are discovered recursively relative to the current working
    directory; names listed in IGNORED_FILES are skipped.
    """
    pages = [
        path for path in glob('**/*.html', recursive=True)
        if path not in IGNORED_FILES
    ]
    return find_all_named_anchors_in_files(pages)


def load_inventory():
    """Load the recorded link inventory.

    Returns:
        A set of link strings. The set is empty when the inventory file does
        not exist yet or contains no entries.
    """
    if not os.path.exists(INVENTORY_FILENAME):
        return set()
    with io.open(INVENTORY_FILENAME, 'r') as inventory_file:
        # safe_load only builds plain Python types, which is all the
        # inventory needs, and avoids yaml.load's arbitrary object
        # construction.
        entries = yaml.safe_load(inventory_file)
    # An empty file parses to None, which set() cannot consume.
    return set(entries or [])


def save_inventory(inventory):
    """Write the inventory to the inventory file as a sorted YAML list."""
    with io.open(INVENTORY_FILENAME, 'w') as handle:
        # Sorting keeps the file stable between runs.
        ordered = sorted(inventory)
        yaml.dump(ordered, handle)


def load_redirects():
    """Load the declared page redirects.

    Returns:
        The parsed content of the redirects file, expected to be a list of
        mappings with ``from`` and ``to`` keys. An empty (or comment-only)
        file yields an empty list so callers can always iterate the result.
    """
    with io.open(REDIRECTS_FILENAME, 'r') as redirects_file:
        # safe_load is sufficient for plain lists and mappings and avoids
        # yaml.load's arbitrary object construction.
        return yaml.safe_load(redirects_file) or []


def expand_redirects(redirects, inventory):
    """Expand page redirects into per-link redirects and validate them.

    For a redirect such as old.html -> new.html, every inventory link that
    starts with ``old.html`` (the page itself and deep links like
    ``old.html#1``) is mapped to its counterpart under ``new.html``.

    Args:
        redirects: Iterable of mappings with ``from`` and ``to`` keys.
        inventory: Collection of known link strings.

    Returns:
        A ``(valid_redirects, missing_redirects)`` tuple.
        ``valid_redirects`` is the set of source links whose destination is
        present in the inventory. ``missing_redirects`` is a set of
        ``(source_link, dest_link)`` pairs whose destination is absent.
    """
    valid_redirects = set()
    missing_redirects = set()

    for redirect in redirects:
        from_ = redirect['from']

        # Get all links that start with the page. This gathers all deep
        # links: old.html may have had #1, #2, #3 and each one needs a
        # counterpart in the destination page.
        for source_link in inventory:
            if not source_link.startswith(from_):
                continue

            # Swap only the matched leading prefix. str.replace() would also
            # rewrite later occurrences of the prefix, e.g. inside the
            # anchor, and produce a destination that never existed.
            dest_link = redirect['to'] + source_link[len(from_):]
            if dest_link in inventory:
                valid_redirects.add(source_link)
            else:
                missing_redirects.add((source_link, dest_link))

    return valid_redirects, missing_redirects


def update_command(args):
    """Updates the current inventory of links with any new links added.

    This should be run after adding new documentation to make a record of new
    items added.
    """
    # NOTE: the docstring above doubles as the sub-command's help text, so
    # its wording is kept as-is.

    # Link discovery globs relative to the working directory.
    os.chdir(HTML_DIR)

    known = load_inventory()
    current = find_links()

    added = current - known
    print('Found {} new links.'.format(len(added)))

    # Existing entries are kept; the inventory only ever grows here.
    save_inventory(known | current)

    return 0


def check_command(args):
    """Checks the current set of links against the inventory.

    This should be run on every documentation change to ensure that no deep
    links have been broken and that new links are tracked in the inventory.
    """
    # NOTE: the docstring above doubles as the sub-command's help text, so
    # its wording is kept as-is.
    #
    # Returns the process exit status: 1 when inventory links are missing
    # from the built HTML, 2 when the build contains links that are not in
    # the inventory yet, 0 otherwise.

    # Link discovery globs relative to the working directory.
    os.chdir(HTML_DIR)

    inventory = load_inventory()
    redirects = load_redirects()
    links = find_links()

    valid_redirects, missing_redirects = expand_redirects(redirects, inventory)
    if missing_redirects:
        print(
            'The following redirects are missing deep link anchors in the '
            'destination:')
        # Sets iterate in an arbitrary order; sort so that the report is
        # stable between runs and easy to compare.
        for source, dest in sorted(missing_redirects):
            print(' * {} -> {}'.format(source, dest))

    # A link that disappeared is acceptable when a redirect covers it.
    missing_links = inventory.difference(links)
    missing_links -= valid_redirects

    if missing_links:
        print('Missing the following deep links:')
        for link in sorted(missing_links):
            print(' * {}'.format(link))
        return 1

    new_links = links.difference(inventory)

    if new_links:
        print('The following new deep links were added:')
        for link in sorted(new_links):
            print(' * {}'.format(link))
        print('Run nox -s updatelinks to update them in git.')
        return 2

    print('All is well')
    return 0


if __name__ == '__main__':
    # Command-line entry point: one sub-command per handler function. Each
    # handler's docstring is reused as the sub-command's help text.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    for command_name, handler in (
            ('update', update_command),
            ('check', check_command)):
        command_parser = subparsers.add_parser(
            command_name, help=handler.__doc__)
        command_parser.set_defaults(func=handler)

    args = parser.parse_args()

    # No sub-command given: nothing was bound to ``func``.
    selected = getattr(args, 'func', None)
    if selected is None:
        parser.print_help()
        sys.exit(1)

    # The handler's return value becomes the process exit status.
    sys.exit(selected(args))
7 changes: 7 additions & 0 deletions scripts/linkmonitor/redirects.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Unfortunately, readthedocs doesn't allow us to specify redirects in yaml.
# They have to be individually added in the readthedocs UI. This file should
# match the configuration of page redirects in the UI.
# See also: https://github.com/rtfd/readthedocs.org/issues/2904

- from: old.html
  to: new.html
2 changes: 2 additions & 0 deletions scripts/linkmonitor/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
beautifulsoup4==4.6.0
PyYAML==3.12