rustbuild: Add a link checker for documentation

alexcrichton · alexcrichton · commit defd1b3392df · 2016-03-08T13:44:14.000-08:00
Add a script that runs to verify that `href` links in documents are
correct. We're always getting a steady stream of "fix a broken link" PRs and
issue reports, and we should probably just nip them all in the bud.
diff --git a/src/bootstrap/bootstrap.py b/src/bootstrap/bootstrap.py
@@ -73,7 +73,8 @@ def download_rust_nightly(self):
 
         if self.rustc().startswith(self.bin_root()) and \
            (not os.path.exists(self.rustc()) or self.rustc_out_of_date()):
-            shutil.rmtree(self.bin_root())
+            if os.path.exists(self.bin_root()):
+                shutil.rmtree(self.bin_root())
             filename = "rust-std-nightly-" + self.build + ".tar.gz"
             url = "https://static.rust-lang.org/dist/" + self.snap_rustc_date()
             tarball = os.path.join(rustc_cache, filename)
diff --git a/src/bootstrap/build/check.rs b/src/bootstrap/build/check.rs
@@ -0,0 +1,21 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::process::Command;
+
+use build::{Build, Compiler};
+
+pub fn linkcheck(build: &Build, stage: u32, host: &str) {
+    println!("Linkcheck stage{} ({})", stage, host);
+    let compiler = Compiler::new(stage, host);
+    let linkchecker = build.tool(&compiler, "linkchecker");
+    build.run(Command::new(&linkchecker)
+                     .arg(build.out.join(host).join("doc")));
+}
diff --git a/src/bootstrap/build/mod.rs b/src/bootstrap/build/mod.rs
@@ -30,6 +30,7 @@ macro_rules! t {
 
 mod cc;
 mod channel;
+mod check;
 mod clean;
 mod compile;
 mod config;
@@ -171,6 +172,9 @@ impl Build {
                 Rustc { stage } => {
                     compile::assemble_rustc(self, stage, target.target);
                 }
+                ToolLinkchecker { stage } => {
+                    compile::tool(self, stage, target.target, "linkchecker");
+                }
                 ToolRustbook { stage } => {
                     compile::tool(self, stage, target.target, "rustbook");
                 }
@@ -195,6 +199,10 @@ impl Build {
                     doc::rustc(self, stage, target.target, &doc_out);
                 }
 
+                CheckLinkcheck { stage } => {
+                    check::linkcheck(self, stage, target.target);
+                }
+
                 Doc { .. } | // pseudo-steps
                 Check { .. } => {}
             }
diff --git a/src/bootstrap/build/step.rs b/src/bootstrap/build/step.rs
@@ -46,6 +46,7 @@ macro_rules! targets {
             }),
 
             // Various tools that we can build as part of the build.
+            (tool_linkchecker, ToolLinkchecker { stage: u32 }),
             (tool_rustbook, ToolRustbook { stage: u32 }),
 
             // Steps for long-running native builds. Ideally these wouldn't
@@ -71,6 +72,7 @@ macro_rules! targets {
             // Steps for running tests. The 'check' target is just a pseudo
             // target to depend on a bunch of others.
             (check, Check { stage: u32, compiler: Compiler<'a> }),
+            (check_linkcheck, CheckLinkcheck { stage: u32 }),
         }
     }
 }
@@ -200,6 +202,8 @@ fn add_steps<'a>(build: &'a Build,
         }
 
         targets!(add_step);
+
+        panic!("unknown step: {}", step);
     }
 }
 
@@ -273,7 +277,15 @@ impl<'a> Step<'a> {
                      self.doc_std(stage)]
             }
             Source::Check { stage, compiler: _ } => {
-                vec![]
+                vec![self.check_linkcheck(stage)]
+            }
+            Source::CheckLinkcheck { stage } => {
+                vec![self.tool_linkchecker(stage), self.doc(stage)]
+            }
+
+            Source::ToolLinkchecker { stage } => {
+                vec![self.libstd(stage, self.compiler(stage))]
+            }
             Source::ToolRustbook { stage } => {
                 vec![self.librustc(stage, self.compiler(stage))]
             }
diff --git a/src/tools/linkchecker/Cargo.lock b/src/tools/linkchecker/Cargo.lock
diff --git a/src/tools/linkchecker/Cargo.toml b/src/tools/linkchecker/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "linkchecker"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+
+[dependencies]
+url = "0.5"
+
+[[bin]]
+name = "linkchecker"
+path = "main.rs"
diff --git a/src/tools/linkchecker/main.rs b/src/tools/linkchecker/main.rs
@@ -0,0 +1,161 @@
+// Copyright 2016 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Script to check the validity of `href` links in our HTML documentation.
+//!
+//! In the past we've been quite prone to writing broken links, as most of
+//! them are added manually rather than automatically. As files move over
+//! time or APIs change, old links become stale or broken. The purpose of this
+//! script is to check all relative links in our documentation to make sure they
+//! actually point to a valid place.
+//!
+//! Currently this doesn't actually do any HTML parsing or anything fancy like
+//! that; it just has a simple "regex" to search for `href` attributes. These
+//! values are then translated to file URLs if possible and then the
+//! destination is asserted to exist.
+//!
+//! A few whitelisted exceptions are allowed as there are known bugs in rustdoc,
+//! but this should catch the majority of "broken link" cases.
+
+extern crate url;
+
+use std::env;
+use std::fs::File;
+use std::io::prelude::*;
+use std::path::Path;
+
+use url::{Url, UrlParser};
+
+macro_rules! t {
+    ($e:expr) => (match $e {
+        Ok(e) => e,
+        Err(e) => panic!("{} failed with {}", stringify!($e), e),
+    })
+}
+
+fn main() {
+    let docs = env::args().nth(1).unwrap();
+    let docs = env::current_dir().unwrap().join(docs);
+    let mut url = Url::from_file_path(&docs).unwrap();
+    let mut errors = false;
+    walk(&docs, &docs, &mut url, &mut errors);
+    if errors {
+        panic!("found some broken links");
+    }
+}
+
+fn walk(root: &Path, dir: &Path, url: &mut Url, errors: &mut bool) {
+    for entry in t!(dir.read_dir()).map(|e| t!(e)) {
+        let path = entry.path();
+        let kind = t!(entry.file_type());
+        url.path_mut().unwrap().push(entry.file_name().into_string().unwrap());
+        if kind.is_dir() {
+            walk(root, &path, url, errors);
+        } else {
+            check(root, &path, url, errors);
+        }
+        url.path_mut().unwrap().pop();
+    }
+}
+
+fn check(root: &Path, file: &Path, base: &Url, errors: &mut bool) {
+    // Ignore js files: they are not as prone to errors as the rest of the
+    // documentation is, and they otherwise bring up false positives.
+    if file.extension().and_then(|s| s.to_str()) == Some("js") {
+        return
+    }
+
+    let pretty_file = file.strip_prefix(root).unwrap_or(file);
+
+    // Unfortunately we're not 100% full of valid links today so we need a few
+    // whitelists to get this past `make check` today.
+    if let Some(path) = pretty_file.to_str() {
+        // FIXME(#32129)
+        if path == "std/string/struct.String.html" {
+            return
+        }
+        // FIXME(#32130)
+        if path.contains("btree_set/struct.BTreeSet.html") ||
+           path == "collections/struct.BTreeSet.html" {
+            return
+        }
+        // FIXME(#31948)
+        if path.contains("ParseFloatError") {
+            return
+        }
+
+        // NOTE(review): whitelisted with no reason or tracking issue - TODO document
+        if path == "std/sys/ext/index.html" {
+            return
+        }
+
+        // weird reexports, but this module is on its way out, so chalk it up to
+        // "rustdoc weirdness" and move on from there
+        if path.contains("scoped_tls") {
+            return
+        }
+    }
+
+    let mut parser = UrlParser::new();
+    parser.base_url(base);
+    let mut contents = String::new();
+    if t!(File::open(file)).read_to_string(&mut contents).is_err() {
+        return
+    }
+
+    for (i, mut line) in contents.lines().enumerate() {
+        // Search for anything that's the regex 'href[ ]*=[ ]*".*?"'
+        while let Some(j) = line.find(" href") {
+            let rest = &line[j + 5..];
+            line = rest;
+            let pos_equals = match rest.find("=") {
+                Some(i) => i,
+                None => continue,
+            };
+            if rest[..pos_equals].trim_left_matches(" ") != "" {
+                continue
+            }
+            let rest = &rest[pos_equals + 1..];
+            let pos_quote = match rest.find("\"").or_else(|| rest.find("'")) {
+                Some(i) => i,
+                None => continue,
+            };
+            if rest[..pos_quote].trim_left_matches(" ") != "" {
+                continue
+            }
+            let rest = &rest[pos_quote + 1..];
+            let url = match rest.find("\"").or_else(|| rest.find("'")) {
+                Some(i) => &rest[..i],
+                None => continue,
+            };
+
+            // Once we've plucked out the URL, parse it using our base url and
+            // then try to extract a file path. If either of these fails then we
+            // just keep going.
+            let parsed_url = match parser.parse(url) {
+                Ok(url) => url,
+                Err(..) => continue,
+            };
+            let path = match parsed_url.to_file_path() {
+                Ok(path) => path,
+                Err(..) => continue,
+            };
+
+            // Alright, if we've found a file name then this file had better
+            // exist! If it doesn't then we register and print an error.
+            if !path.exists() {
+                *errors = true;
+                print!("{}:{}: broken link - ", pretty_file.display(), i + 1);
+                let pretty_path = path.strip_prefix(root).unwrap_or(&path);
+                println!("{}", pretty_path.display());
+            }
+        }
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,7 @@ macro_rules! targets {`
`46`	`46`	`}),`
`47`	`47`
`48`	`48`	`// Various tools that we can build as part of the build.`
	`49`	`+ (tool_linkchecker, ToolLinkchecker { stage: u32 }),`
`49`	`50`	`(tool_rustbook, ToolRustbook { stage: u32 }),`
`50`	`51`
`51`	`52`	`// Steps for long-running native builds. Ideally these wouldn't`
`@@ -71,6 +72,7 @@ macro_rules! targets {`
`71`	`72`	`// Steps for running tests. The 'check' target is just a pseudo`
`72`	`73`	`// target to depend on a bunch of others.`
`73`	`74`	`(check, Check { stage: u32, compiler: Compiler<'a> }),`
	`75`	`+ (check_linkcheck, CheckLinkcheck { stage: u32 }),`
`74`	`76`	`}`
`75`	`77`	`}`
`76`	`78`	`}`
`@@ -200,6 +202,8 @@ fn add_steps<'a>(build: &'a Build,`
`200`	`202`	`}`
`201`	`203`
`202`	`204`	`targets!(add_step);`
	`205`	`+`
	`206`	`+ panic!("unknown step: {}", step);`
`203`	`207`	`}`
`204`	`208`	`}`
`205`	`209`
`@@ -273,7 +277,15 @@ impl<'a> Step<'a> {`
`273`	`277`	`self.doc_std(stage)]`
`274`	`278`	`}`
`275`	`279`	`Source::Check { stage, compiler: _ } => {`
`276`		`- vec![]`
	`280`	`+ vec![self.check_linkcheck(stage)]`
	`281`	`+ }`
	`282`	`+ Source::CheckLinkcheck { stage } => {`
	`283`	`+ vec![self.tool_linkchecker(stage), self.doc(stage)]`
	`284`	`+ }`
	`285`	`+`
	`286`	`+ Source::ToolLinkchecker { stage } => {`
	`287`	`+ vec![self.libstd(stage, self.compiler(stage))]`
	`288`	`+ }`
`277`	`289`	`Source::ToolRustbook { stage } => {`
`278`	`290`	`vec![self.librustc(stage, self.compiler(stage))]`
`279`	`291`	`}`