
Commit 3b0002a

Improve documentation
And tidy up.
1 parent eb2de7d commit 3b0002a

3 files changed: +85, -18 lines

Cargo.toml

Lines changed: 0 additions & 1 deletion
@@ -18,4 +18,3 @@ path = "src/robot.rs"
 ngx = { git = "https://github.com/glyn/ngx-rust",tag="pre-0.5.0"}
 robotstxt = "0.3.0"
 tokio = { version = "1.33.0", features = ["full"] }
-

README.md

Lines changed: 61 additions & 8 deletions
@@ -3,17 +3,70 @@
 This NGINX module enforces the rules in `robots.txt` for web crawlers that choose
 to disregard those rules.
 
-### WORK IN PROGRESS
+Regardless of the rules in `robots.txt`, the module always allows the path `/robots.txt` to be accessed.
+This gives web crawlers the _option_ of obeying `robots.txt`.
+If any other files should always be accessible, they should be allowed by the rules in `robots.txt`.
 
-The current code builds but has not been tested and is missing major pieces of function.
-See [Configuration support](https://github.com/glyn/nginx_robot_access/issues/1) in particular.
+See the following instructions for how to build and configure the module.
 
-### Contributing
+## Building
+
+This module is written in Rust. After [installing Rust](https://www.rust-lang.org/tools/install),
+the module may be built by issuing the following command in the root directory of a clone of this repository:
+~~~
+cargo build --release
+~~~
+
+This will build a shared library in `target/release`.
+
+## Configuration
+
+To enable this module, it must be loaded in the NGINX configuration, e.g.:
+~~~
+load_module /var/lib/libnginx_robot_access.so;
+~~~
+
+For this module to work correctly, the absolute file path of `robots.txt` must be configured in the NGINX configuration using the directive `robots_txt_path`. The directive takes a single argument: the absolute file path of `robots.txt`, e.g.:
+~~~
+robots_txt_path /etc/robots.txt;
+~~~
+
+The directive may be specified in any of the `http`, `server`, or `location` configuration blocks.
+Configuring the directive in the `location` block overrides any configuration of the directive in the `server` block. Configuring the directive in the `server` block overrides any configuration in the `http` block.
+
+For example, here's a simple configuration that enables the module and sets the path to `/etc/robots.txt`:
+~~~
+load_module /var/lib/libnginx_robot_access.so;
+...
+http {
+    ...
+    server {
+        ...
+        location / {
+            ...
+            robots_txt_path /etc/robots.txt;
+        }
+    ...
+~~~
+
+## Debugging
+
+Some debug logging is included in the module. To use this, enable debug logging in the NGINX configuration, e.g.:
+~~~
+error_log logs/error.log debug;
+~~~
+
+## Contributing
 
 See the [Contributor Guide](./CONTRIBUTING.md) if you'd like to submit changes.
 
-### Alternatives
+## Acknowledgements
+
+* [ngx-rust](https://github.com/nginxinc/ngx-rust): a Rust binding for NGINX.
+* [robotstxt](https://github.com/Folyd/robotstxt): a Rust port of Google's [C++ implementation](https://github.com/google/robotstxt). Thanks @Folyd!
+
+## Alternatives
 
-* Configure NGINX to [block specific user agents](https://www.xmodulo.com/block-specific-user-agents-nginx-web-server.html), although this doesn't share the configuration in `robots.txt`
-* [NGINX configuration for AI web crawlers](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/servers/nginx.conf)
-* [Roboo](https://github.com/yuri-gushin/Roboo) protects against robots that do not implement certain browser features
+* Configure NGINX to [block specific user agents](https://www.xmodulo.com/block-specific-user-agents-nginx-web-server.html), although this doesn't share the configuration in `robots.txt`.
+* [NGINX configuration for AI web crawlers](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/servers/nginx.conf), but again this doesn't share the configuration in `robots.txt`.
+* [Roboo](https://github.com/yuri-gushin/Roboo) is an NGINX module which protects against robots that fail to implement certain browser features.
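
The enforcement check itself is not shown in these hunks. Since `robotstxt` is a declared dependency (see `Cargo.toml` above and the Acknowledgements), a request's user agent and URL can be matched against the loaded `robots.txt` body roughly as in the following sketch. This assumes the crate's `DefaultMatcher::one_agent_allowed_by_robots` API and uses a made-up `robots.txt`; it is an illustration, not the module's actual handler.

~~~
// Illustrative only: check whether a user agent may fetch a URL according to a
// robots.txt body, using the robotstxt crate (a declared dependency of this module).
use robotstxt::DefaultMatcher;

fn allowed(robots_txt_contents: &str, user_agent: &str, url: &str) -> bool {
    let mut matcher = DefaultMatcher::default();
    matcher.one_agent_allowed_by_robots(robots_txt_contents, user_agent, url)
}

fn main() {
    // Hypothetical robots.txt: Googlebot may not crawl anything under /private.
    let robots = "User-agent: Googlebot\nDisallow: /private\n";
    assert!(!allowed(robots, "Googlebot", "https://example.com/private/page"));
    assert!(allowed(robots, "Googlebot", "https://example.com/public/page"));
}
~~~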

src/robot.rs

Lines changed: 24 additions & 9 deletions
@@ -1,8 +1,9 @@
+// This module was based closely on the curl example module from ngx-rust.
 use ngx::ffi::{
     nginx_version, ngx_array_push, ngx_command_t, ngx_conf_t, ngx_http_core_module, ngx_http_handler_pt,
     ngx_http_module_t, ngx_http_phases_NGX_HTTP_ACCESS_PHASE, ngx_http_request_t, ngx_int_t, ngx_module_t, ngx_str_t,
-    ngx_uint_t, NGX_CONF_TAKE1, NGX_HTTP_LOC_CONF, NGX_HTTP_MODULE, NGX_RS_HTTP_LOC_CONF_OFFSET,
-    NGX_RS_MODULE_SIGNATURE,
+    ngx_uint_t, NGX_CONF_TAKE1, NGX_HTTP_MAIN_CONF, NGX_HTTP_SRV_CONF, NGX_HTTP_LOC_CONF, NGX_HTTP_MODULE,
+    NGX_RS_HTTP_LOC_CONF_OFFSET, NGX_RS_MODULE_SIGNATURE,
 };
 use ngx::http::MergeConfigError;
 use ngx::{core, core::Status, http, http::HTTPModule};
@@ -36,15 +37,21 @@ impl http::HTTPModule for Module {
 
 #[derive(Debug, Default)]
 struct ModuleConfig {
-    robots_txt_path: String,
-    robots_txt_contents: String,
+    robots_txt_path: String,     // absolute file path of robots.txt
+    robots_txt_contents: String, // the contents of robots.txt, read by this module from robots_txt_path
 }
 
 #[no_mangle]
 static mut ngx_http_robots_commands: [ngx_command_t; 2] = [
+    // define the robots_txt_path configuration directive
     ngx_command_t {
         name: ngx_string!("robots_txt_path"),
-        type_: (NGX_HTTP_LOC_CONF | NGX_CONF_TAKE1) as ngx_uint_t,
+        // The directive may appear in the http, server, or location block and takes
+        // a single argument (the absolute file path of robots.txt).
+        type_: ( NGX_HTTP_MAIN_CONF
+            | NGX_HTTP_SRV_CONF
+            | NGX_HTTP_LOC_CONF
+            | NGX_CONF_TAKE1 ) as ngx_uint_t,
         set: Some(ngx_http_robots_commands_set_robots_txt_path),
         conf: NGX_RS_HTTP_LOC_CONF_OFFSET,
         offset: 0,
@@ -101,15 +108,21 @@ pub static mut ngx_http_robots_module: ngx_module_t = ngx_module_t {
 
 impl http::Merge for ModuleConfig {
     fn merge(&mut self, prev: &ModuleConfig) -> Result<(), MergeConfigError> {
-        // If robots.txt path is not set at this level, inherit the setting from the higher level
+        // If robots.txt path is not set at this level, inherit the setting from the higher level.
+        // This means that configuring the directive in the location block overrides any configuration
+        // of the directive in the server block and that configuring the directive in the server block
+        // overrides any configuration in the http block.
         if self.robots_txt_path == "" {
             self.robots_txt_path = prev.robots_txt_path.to_string();
         }
+
         self.robots_txt_contents = "".to_string(); // default value
+
         // If robots.txt path has been set, store the contents of the file
         if self.robots_txt_path != "" {
             self.robots_txt_contents = fs::read_to_string(&self.robots_txt_path).unwrap();
         }
+
         Ok(())
     }
 }
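
The comment block added in this hunk describes the override order. As a standalone illustration of that inheritance rule, here is a simplified sketch using a hypothetical stand-in `Conf` struct rather than the module's ngx-backed `ModuleConfig`: the lower-level value is kept when present and filled in from the level above when empty.

~~~
// Simplified sketch: a child configuration inherits robots_txt_path from its
// parent only when the child has not set one, so the most specific setting wins.
#[derive(Default)]
struct Conf {
    robots_txt_path: String,
}

fn merge(child: &mut Conf, parent: &Conf) {
    if child.robots_txt_path.is_empty() {
        child.robots_txt_path = parent.robots_txt_path.clone();
    }
}

fn main() {
    let http = Conf { robots_txt_path: "/etc/robots.txt".to_string() };

    // location block with no robots_txt_path: inherits the http-level value
    let mut loc_a = Conf::default();
    merge(&mut loc_a, &http);
    assert_eq!(loc_a.robots_txt_path, "/etc/robots.txt");

    // location block with its own (hypothetical) robots_txt_path: overrides http
    let mut loc_b = Conf { robots_txt_path: "/srv/robots.txt".to_string() };
    merge(&mut loc_b, &http);
    assert_eq!(loc_b.robots_txt_path, "/srv/robots.txt");
}
~~~
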
@@ -179,9 +192,11 @@ extern "C" fn ngx_http_robots_commands_set_robots_txt_path(
     std::ptr::null_mut()
 }
 
-/// Extract the matchable part of a user agent string, essentially stopping at
-/// the first invalid character.
-/// Example: 'Googlebot/2.1' becomes 'Googlebot'
+// Extract the matchable part of a user agent string, essentially stopping at
+// the first invalid character.
+// Example: 'Googlebot/2.1' becomes 'Googlebot'
+//
+// This function and its unit tests were inherited from robotstxt.
 fn extract_user_agent(user_agent: &str) -> &str {
     // Allowed characters in user-agent are [a-zA-Z_-].
     if let Some(end) =
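
The hunk ends mid-expression at `if let Some(end) =`, so the rest of `extract_user_agent` is not visible here. A plausible completion, consistent with the `[a-zA-Z_-]` comment and the 'Googlebot/2.1' to 'Googlebot' example (an assumption, not necessarily the committed code):

~~~
// Plausible completion (assumption): scan for the first character outside
// [a-zA-Z_-] and return the prefix before it; otherwise return the whole string.
fn extract_user_agent(user_agent: &str) -> &str {
    // Allowed characters in user-agent are [a-zA-Z_-].
    if let Some(end) = user_agent.find(|c: char| !(c.is_ascii_alphabetic() || c == '_' || c == '-')) {
        &user_agent[..end]
    } else {
        user_agent
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strips_version_suffix() {
        assert_eq!(extract_user_agent("Googlebot/2.1"), "Googlebot");
        assert_eq!(extract_user_agent("Mozilla/5.0 (compatible)"), "Mozilla");
    }
}
~~~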
