
Commit 3b0002a

Improve documentation
And tidy up.
1 parent eb2de7d commit 3b0002a

3 files changed: +85, -18 lines

Cargo.toml

Lines changed: 0 additions & 1 deletion
@@ -18,4 +18,3 @@ path = "src/robot.rs"
 ngx = { git = "https://github.com/glyn/ngx-rust",tag="pre-0.5.0"}
 robotstxt = "0.3.0"
 tokio = { version = "1.33.0", features = ["full"] }
-

README.md

Lines changed: 61 additions & 8 deletions
@@ -3,17 +3,70 @@
 This NGINX module enforces the rules in `robots.txt` for web crawlers that choose
 to disregard those rules.
 
-### WORK IN PROGRESS
+Regardless of the rules in `robots.txt`, the module always allows the path `/robots.txt` to be accessed.
+This gives web crawlers the _option_ of obeying `robots.txt`.
+If any other files should always be accessible, they should be allowed by the rules in `robots.txt`.
 
-The current code builds but has not been tested and is missing major pieces of function.
-See [Configuration support](https://github.com/glyn/nginx_robot_access/issues/1) in particular.
+See the following instructions for how to build and configure the module.
 
-### Contributing
+## Building
+
+This module is written in Rust. After [installing Rust](https://www.rust-lang.org/tools/install),
+the module may be built by issuing the following command in the root directory of a clone of this repository:
+~~~
+cargo build --release
+~~~
+
+This will build a shared library in `target/release`.
+
+## Configuration
+
+To enable this module, it must be loaded in the NGINX configuration, e.g.:
+~~~
+load_module /var/lib/libnginx_robot_access.so;
+~~~
+
+For this module to work correctly, the absolute file path of `robots.txt` must be configured in the NGINX configuration using the directive `robots_txt_path`. The directive takes a single argument: the absolute file path of `robots.txt`, e.g.:
+~~~
+robots_txt_path /etc/robots.txt;
+~~~
+
+The directive may be specified in any of the `http`, `server`, or `location` configuration blocks.
+Configuring the directive in the `location` block overrides any configuration of the directive in the `server` block. Configuring the directive in the `server` block overrides any configuration in the `http` block.
+
+For example, here's a simple configuration that enables the module and sets the path to `/etc/robots.txt`:
+~~~
+load_module /var/lib/libnginx_robot_access.so;
+...
+http {
+    ...
+    server {
+        ...
+        location / {
+            ...
+            robots_txt_path /etc/robots.txt;
+        }
+    ...
+~~~
+
+## Debugging
+
+Some debug logging is included in the module. To use this, enable debug logging in the NGINX configuration, e.g.:
+~~~
+error_log logs/error.log debug;
+~~~
+
+## Contributing
 
 See the [Contributor Guide](./CONTRIBUTING.md) if you'd like to submit changes.
 
-### Alternatives
+## Acknowledgements
+
+* [ngx-rust](https://github.com/nginxinc/ngx-rust): a Rust binding for NGINX.
+* [robotstxt](https://github.com/Folyd/robotstxt): a Rust port of Google's [C++ implementation](https://github.com/google/robotstxt). Thanks @Folyd!
+
+## Alternatives
 
-* Configure NGINX to [block specific user agents](https://www.xmodulo.com/block-specific-user-agents-nginx-web-server.html), although this doesn't share the configuration in `robots.txt`
-* [NGINX configuration for AI web crawlers](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/servers/nginx.conf)
-* [Roboo](https://github.com/yuri-gushin/Roboo) protects against robots that do not implement certain browser features
+* Configure NGINX to [block specific user agents](https://www.xmodulo.com/block-specific-user-agents-nginx-web-server.html), although this doesn't share the configuration in `robots.txt`.
+* [NGINX configuration for AI web crawlers](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/servers/nginx.conf), but again this doesn't share the configuration in `robots.txt`.
+* [Roboo](https://github.com/yuri-gushin/Roboo) is an NGINX module which protects against robots that fail to implement certain browser features.
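
The enforcement check itself is not shown in these hunks. Since `robotstxt` is a declared dependency (see `Cargo.toml` above and the Acknowledgements), a request's user agent and URL can be matched against the loaded `robots.txt` body roughly as in the following sketch. This assumes the crate's `DefaultMatcher::one_agent_allowed_by_robots` API and uses a made-up `robots.txt`; it is an illustration, not the module's actual handler.

~~~
// Illustrative only: check whether a user agent may fetch a URL according to a
// robots.txt body, using the robotstxt crate (a declared dependency of this module).
use robotstxt::DefaultMatcher;

fn allowed(robots_txt_contents: &str, user_agent: &str, url: &str) -> bool {
    let mut matcher = DefaultMatcher::default();
    matcher.one_agent_allowed_by_robots(robots_txt_contents, user_agent, url)
}

fn main() {
    // Hypothetical robots.txt: Googlebot may not crawl anything under /private.
    let robots = "User-agent: Googlebot\nDisallow: /private\n";
    assert!(!allowed(robots, "Googlebot", "https://example.com/private/page"));
    assert!(allowed(robots, "Googlebot", "https://example.com/public/page"));
}
~~~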

src/robot.rs

Lines changed: 24 additions & 9 deletions
@@ -1,8 +1,9 @@
+// This module was based closely on the curl example module from ngx-rust.
 use ngx::ffi::{
     nginx_version, ngx_array_push, ngx_command_t, ngx_conf_t, ngx_http_core_module, ngx_http_handler_pt,
     ngx_http_module_t, ngx_http_phases_NGX_HTTP_ACCESS_PHASE, ngx_http_request_t, ngx_int_t, ngx_module_t, ngx_str_t,
-    ngx_uint_t, NGX_CONF_TAKE1, NGX_HTTP_LOC_CONF, NGX_HTTP_MODULE, NGX_RS_HTTP_LOC_CONF_OFFSET,
-    NGX_RS_MODULE_SIGNATURE,
+    ngx_uint_t, NGX_CONF_TAKE1, NGX_HTTP_MAIN_CONF, NGX_HTTP_SRV_CONF, NGX_HTTP_LOC_CONF, NGX_HTTP_MODULE,
+    NGX_RS_HTTP_LOC_CONF_OFFSET, NGX_RS_MODULE_SIGNATURE,
 };
 use ngx::http::MergeConfigError;
 use ngx::{core, core::Status, http, http::HTTPModule};
@@ -36,15 +37,21 @@ impl http::HTTPModule for Module {
 
 #[derive(Debug, Default)]
 struct ModuleConfig {
-    robots_txt_path: String,
-    robots_txt_contents: String,
+    robots_txt_path: String,     // absolute file path of robots.txt
+    robots_txt_contents: String, // the contents of robots.txt, read by this module from robots_txt_path
 }
 
 #[no_mangle]
 static mut ngx_http_robots_commands: [ngx_command_t; 2] = [
+    // define the robots_txt_path configuration directive
     ngx_command_t {
         name: ngx_string!("robots_txt_path"),
-        type_: (NGX_HTTP_LOC_CONF | NGX_CONF_TAKE1) as ngx_uint_t,
+        // The directive may appear in the http, server, or location block and takes
+        // a single argument (the absolute file path of robots.txt).
+        type_: ( NGX_HTTP_MAIN_CONF
+            | NGX_HTTP_SRV_CONF
+            | NGX_HTTP_LOC_CONF
+            | NGX_CONF_TAKE1 ) as ngx_uint_t,
         set: Some(ngx_http_robots_commands_set_robots_txt_path),
         conf: NGX_RS_HTTP_LOC_CONF_OFFSET,
         offset: 0,
@@ -101,15 +108,21 @@ pub static mut ngx_http_robots_module: ngx_module_t = ngx_module_t {
 
 impl http::Merge for ModuleConfig {
     fn merge(&mut self, prev: &ModuleConfig) -> Result<(), MergeConfigError> {
-        // If robots.txt path is not set at this level, inherit the setting from the higher level
+        // If robots.txt path is not set at this level, inherit the setting from the higher level.
+        // This means that configuring the directive in the location block overrides any configuration
+        // of the directive in the server block and that configuring the directive in the server block
+        // overrides any configuration in the http block.
         if self.robots_txt_path == "" {
             self.robots_txt_path = prev.robots_txt_path.to_string();
         }
+
         self.robots_txt_contents = "".to_string(); // default value
+
         // If robots.txt path has been set, store the contents of the file
         if self.robots_txt_path != "" {
             self.robots_txt_contents = fs::read_to_string(&self.robots_txt_path).unwrap();
         }
+
         Ok(())
     }
 }
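
The comment block added in this hunk describes the override order. As a standalone illustration of that inheritance rule, here is a simplified sketch using a hypothetical stand-in `Conf` struct rather than the module's ngx-backed `ModuleConfig`: the lower-level value is kept when present and filled in from the level above when empty.

~~~
// Simplified sketch: a child configuration inherits robots_txt_path from its
// parent only when the child has not set one, so the most specific setting wins.
#[derive(Default)]
struct Conf {
    robots_txt_path: String,
}

fn merge(child: &mut Conf, parent: &Conf) {
    if child.robots_txt_path.is_empty() {
        child.robots_txt_path = parent.robots_txt_path.clone();
    }
}

fn main() {
    let http = Conf { robots_txt_path: "/etc/robots.txt".to_string() };

    // location block with no robots_txt_path: inherits the http-level value
    let mut loc_a = Conf::default();
    merge(&mut loc_a, &http);
    assert_eq!(loc_a.robots_txt_path, "/etc/robots.txt");

    // location block with its own (hypothetical) robots_txt_path: overrides http
    let mut loc_b = Conf { robots_txt_path: "/srv/robots.txt".to_string() };
    merge(&mut loc_b, &http);
    assert_eq!(loc_b.robots_txt_path, "/srv/robots.txt");
}
~~~
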
@@ -179,9 +192,11 @@ extern "C" fn ngx_http_robots_commands_set_robots_txt_path(
     std::ptr::null_mut()
 }
 
-/// Extract the matchable part of a user agent string, essentially stopping at
-/// the first invalid character.
-/// Example: 'Googlebot/2.1' becomes 'Googlebot'
+// Extract the matchable part of a user agent string, essentially stopping at
+// the first invalid character.
+// Example: 'Googlebot/2.1' becomes 'Googlebot'
+//
+// This function and its unit tests were inherited from robotstxt.
 fn extract_user_agent(user_agent: &str) -> &str {
     // Allowed characters in user-agent are [a-zA-Z_-].
     if let Some(end) =
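
The hunk ends mid-expression at `if let Some(end) =`, so the rest of `extract_user_agent` is not visible here. A plausible completion, consistent with the `[a-zA-Z_-]` comment and the 'Googlebot/2.1' to 'Googlebot' example (an assumption, not necessarily the committed code):

~~~
// Plausible completion (assumption): scan for the first character outside
// [a-zA-Z_-] and return the prefix before it; otherwise return the whole string.
fn extract_user_agent(user_agent: &str) -> &str {
    // Allowed characters in user-agent are [a-zA-Z_-].
    if let Some(end) = user_agent.find(|c: char| !(c.is_ascii_alphabetic() || c == '_' || c == '-')) {
        &user_agent[..end]
    } else {
        user_agent
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn strips_version_suffix() {
        assert_eq!(extract_user_agent("Googlebot/2.1"), "Googlebot");
        assert_eq!(extract_user_agent("Mozilla/5.0 (compatible)"), "Mozilla");
    }
}
~~~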
