diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..289c226 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,51 @@ +# Git +.git +.gitignore + +# Documentation +README.md +*.md + +# Build artifacts +fetch-mcp-server +*.exe +*.dll +*.so +*.dylib + +# Test files +*_test.go +test* +*test* + +# IDE and editor files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS files +.DS_Store +Thumbs.db + +# Temporary files +*.tmp +*.temp +.cache/ + +# Logs +*.log + +# Docker +Dockerfile* +.dockerignore + +# CI/CD +.github/ +.gitlab-ci.yml +.travis.yml +.circleci/ + +# Dependencies (will be downloaded in container) +vendor/ \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..b931bdd --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,44 @@ +name: Build artifacts + +on: + workflow_call: + +permissions: + contents: read + +jobs: + + build: + name: Build and Test + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + + - name: Set up Go + uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5 + with: + go-version-file: 'go.mod' + cache: true + + - name: Install Task + uses: arduino/setup-task@v2 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install dependencies + run: task install + + - name: Build + run: task build + + - name: Test + run: task test + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: fetch-server + path: build/fetch-server + retention-days: 7 \ No newline at end of file diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..bb172d1 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,25 @@ +name: Linting + +on: + workflow_call: + +permissions: + contents: read + +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + + - name: Set up Go + uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5 + with: + go-version-file: 'go.mod' + cache: true + + - name: Run golangci-lint + uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 # v8.0.0 + with: + args: --timeout=5m \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..8d71c71 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,87 @@ +name: Release + +on: + push: + tags: + - 'v*' + +jobs: + release: + name: Release Container + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: 'go.mod' + cache: true + + - name: Install Task + uses: arduino/setup-task@v2 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install dependencies + run: task install + + - name: Test + run: task test + + - name: Setup Ko + uses: ko-build/setup-ko@v0.9 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract tag version + id: tag + run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + + - name: Set repository owner lowercase + id: repo_owner + run: echo "OWNER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT + + - name: Build and push container + env: + KO_DOCKER_REPO: ghcr.io/${{ steps.repo_owner.outputs.OWNER }}/fetch + VERSION: ${{ steps.tag.outputs.VERSION }} + CREATION_TIME: $(date -u +'%Y-%m-%dT%H:%M:%SZ') + run: | + # Build and push the container with reproducible build flags + ko build \ + --bare \ + --sbom=spdx \ + 
--platform=linux/amd64,linux/arm64 \ + --base-import-paths \ + --tags $VERSION,latest \ + ./cmd/server + + - name: Install Cosign + uses: sigstore/cosign-installer@3454372f43399081ed03b604cb2d021dabca52bb # v3.8.2 + + - name: Sign Image with Cosign + env: + KO_DOCKER_REPO: ghcr.io/${{ steps.repo_owner.outputs.OWNER }}/fetch + run: | + TAG=$(echo "${{ steps.tag.outputs.VERSION }}" | sed 's/+/_/g') + # Sign the ko image + cosign sign -y $KO_DOCKER_REPO/server:$TAG + + # Sign the latest tag if building from a tag + if [[ "${{ github.ref }}" == refs/tags/* ]]; then + cosign sign -y $KO_DOCKER_REPO/server:latest + fi \ No newline at end of file diff --git a/.github/workflows/run-on-main.yml b/.github/workflows/run-on-main.yml new file mode 100644 index 0000000..6c032c4 --- /dev/null +++ b/.github/workflows/run-on-main.yml @@ -0,0 +1,20 @@ +# This set of workflows runs on every push to the main branch +name: Main build +permissions: + contents: read + +on: + workflow_dispatch: + push: + branches: [ main ] + +jobs: + linting: + name: Linting + uses: ./.github/workflows/lint.yml + tests: + name: Tests + uses: ./.github/workflows/test.yml + build: + name: Build + uses: ./.github/workflows/build.yml \ No newline at end of file diff --git a/.github/workflows/run-on-pr.yml b/.github/workflows/run-on-pr.yml new file mode 100644 index 0000000..89d34f8 --- /dev/null +++ b/.github/workflows/run-on-pr.yml @@ -0,0 +1,16 @@ +# This set of workflows runs on every pull request +name: PR Checks +permissions: + contents: read + +on: + workflow_dispatch: + pull_request: + +jobs: + linting: + name: Linting + uses: ./.github/workflows/lint.yml + tests: + name: Tests + uses: ./.github/workflows/test.yml \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..43c546b --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,29 @@ +name: Tests + +on: + workflow_call: + +permissions: + contents: read + 
+jobs: + test: + name: Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + + - name: Set up Go + uses: actions/setup-go@0aaccfd150d50ccaeb58ebd88d36e91967a5f35b # v5 + with: + go-version-file: 'go.mod' + cache: true + + - name: Install Task + uses: arduino/setup-task@v2 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Test + run: task test \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d71ce80 --- /dev/null +++ b/.gitignore @@ -0,0 +1,36 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work + +# Build directory +/build/ + +# IDE specific files +.idea/ +.vscode/ +*.swp +*.swo + +# OS specific files +.DS_Store +Thumbs.db + +# Kubeconfig files +kubeconfig +.kubeconfig +**/.claude/settings.local.json \ No newline at end of file diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..cac1735 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,106 @@ +version: "2" +run: + issues-exit-code: 1 +output: + formats: + text: + path: stdout + print-linter-name: true + print-issued-lines: true +linters: + default: none + enable: + - depguard + - exhaustive + - goconst + - gocyclo + - gosec + - govet + - ineffassign + - lll + - paralleltest + - promlinter + - revive + - staticcheck + - thelper + - tparallel + - unparam + - unused + settings: + depguard: + rules: + prevent_unmaintained_packages: + list-mode: lax + files: + - $all + - '!$test' + deny: + - pkg: io/ioutil + desc: this is deprecated + gocyclo: + min-complexity: 15 + gosec: + excludes: + - G601 + lll: + line-length: 130 + revive: + severity: warning + rules: + - name: blank-imports + severity: 
warning + - name: context-as-argument + - name: context-keys-type + - name: duplicated-imports + - name: error-naming + - name: error-return + - name: exported + severity: error + - name: if-return + - name: identical-branches + - name: indent-error-flow + - name: import-shadowing + - name: package-comments + - name: redefines-builtin-id + - name: struct-tag + - name: unconditional-recursion + - name: unnecessary-stmt + - name: unreachable-code + - name: unused-parameter + - name: unused-receiver + - name: unhandled-error + disabled: true + exclusions: + generated: lax + rules: + - linters: + - lll + - gocyclo + - errcheck + - dupl + - gosec + - paralleltest + path: (.+)_test\.go + - linters: + - lll + path: .golangci.yml + paths: + - third_party$ + - builtin$ + - examples$ +formatters: + enable: + - gci + - gofmt + settings: + gci: + sections: + - standard + - default + - prefix(github.com/StacklokLabs/fetch) + exclusions: + generated: lax + paths: + - third_party$ + - builtin$ + - examples$ \ No newline at end of file diff --git a/.ko.yaml b/.ko.yaml new file mode 100644 index 0000000..5296cdb --- /dev/null +++ b/.ko.yaml @@ -0,0 +1,15 @@ +builds: +- id: fetch-server + dir: . + main: ./cmd/server + ldflags: + - -X main.version={{.Env.VERSION}} + labels: + org.opencontainers.image.created: "{{.Env.CREATION_TIME}}" + org.opencontainers.image.description: "fetch - A HTTP fetching MCP server." 
+ org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.revision: "{{.Env.GITHUB_SHA}}" + org.opencontainers.image.source: "{{.Env.GITHUB_SERVER_URL}}/{{.Env.GITHUB_REPOSITORY}}" + org.opencontainers.image.title: "fetch" + org.opencontainers.image.url: "{{.Env.GITHUB_SERVER_URL}}/{{.Env.GITHUB_REPOSITORY}}" + org.opencontainers.image.version: "{{.Env.VERSION}}" \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..a2b909e --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,74 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, gender identity and expression, level of experience, +nationality, personal appearance, race, religion, or sexual identity and +orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at [code-of-conduct@stacklok.dev](mailto:code-of-conduct@stacklok.dev). All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..4c1c766 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,83 @@ +# Contributing to `fetch` + +First off, thank you for taking the time to contribute to `fetch`! :+1: :tada: +`fetch` is released under the Apache 2.0 license. If you would like to +contribute something or want to hack on the code, this document should help you +get started. You can find some hints for starting development in `fetch`'s +[README](https://github.com/StacklokLabs/fetch/blob/main/README.md). 
+ +## Table of contents + +- [Code of conduct](#code-of-conduct) +- [Reporting security vulnerabilities](#reporting-security-vulnerabilities) +- [How to contribute](#how-to-contribute) + - [Using GitHub Issues](#using-github-issues) + - [Not sure how to start contributing?](#not-sure-how-to-start-contributing) + - [Pull request process](#pull-request-process) + - [Contributing to docs](#contributing-to-docs) + - [Commit message guidelines](#commit-message-guidelines) + +## Code of conduct + +This project adheres to the +[Contributor Covenant](https://github.com/StacklokLabs/fetch/blob/main/CODE_OF_CONDUCT.md) +code of conduct. By participating, you are expected to uphold this code. Please +report unacceptable behavior to +[code-of-conduct@stacklok.dev](mailto:code-of-conduct@stacklok.dev). + +## Reporting security vulnerabilities + +If you think you have found a security vulnerability in fetch please DO NOT +disclose it publicly until we've had a chance to fix it. Please don't report +security vulnerabilities using GitHub issues; instead, please follow this +[process](https://github.com/StacklokLabs/fetch/blob/main/SECURITY.md) + +## How to contribute + +### Using GitHub Issues + +We use GitHub issues to track bugs and enhancements. If you have a general usage +question, please ask in +[fetch's discussion forum](https://discord.gg/stacklok). + +If you are reporting a bug, please help to speed up problem diagnosis by +providing as much information as possible. Ideally, that would include a small +sample project that reproduces the problem. + +### Not sure how to start contributing? + +PRs to resolve existing issues are greatly appreciated and issues labeled as +["good first issue"](https://github.com/StacklokLabs/fetch/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) +are a great place to start! 
+ +### Pull request process +- All commits must include a Signed-off-by trailer at the end of each commit message to indicate that the contributor agrees to the Developer Certificate of Origin. For additional details, check out the [DCO instructions](dco.md). + +- Create an issue outlining the fix or feature. +- Fork the fetch repository to your own GitHub account and clone it locally. +- Hack on your changes. +- Correctly format your commit messages, see + [Commit message guidelines](#commit-message-guidelines) below. +- Open a PR by ensuring the title and its description reflect the content of the + PR. +- Ensure that CI passes, if it fails, fix the failures. +- Every pull request requires a review from the core fetch team before + merging. +- Once approved, all of your commits will be squashed into a single commit with + your PR title. + +### Contributing to docs + +TBD + +### Commit message guidelines + +We follow the commit formatting recommendations found on +[Chris Beams' How to Write a Git Commit Message article](https://chris.beams.io/posts/git-commit/): + +1. Separate subject from body with a blank line +1. Limit the subject line to 50 characters +1. Capitalize the subject line +1. Do not end the subject line with a period +1. Use the imperative mood in the subject line +1. Use the body to explain what and why vs. how \ No newline at end of file diff --git a/LICENSE b/LICENSE index 261eeb9..e98c401 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,156 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
- - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +# Security Policy + +The ToolHive community takes security seriously! We appreciate your efforts to +disclose your findings responsibly and will make every effort to acknowledge +your contributions. + +## Reporting a vulnerability + +To report a security issue, please use the GitHub Security Advisory +["Report a Vulnerability"](https://github.com/StacklokLabs/toolhive/security/advisories/new) +tab. + +If you are unable to access GitHub you can also email us at +[security@stacklok.com](mailto:security@stacklok.com). + +Include steps to reproduce the vulnerability, the vulnerable versions, and any +additional files to reproduce the vulnerability. + +If you are only comfortable sharing under GPG, please start by sending an email +requesting a public PGP key to use for encryption. 
+ +### Contacting the ToolHive security team + +Contact the team by sending email to +[security@stacklok.com](mailto:security@stacklok.com). + +## Disclosures + +### Private disclosure processes + +The ToolHive community asks that all suspected vulnerabilities be handled in +accordance with the +[Responsible Disclosure model](https://en.wikipedia.org/wiki/Responsible_disclosure). + +### Public disclosure processes + +If anyone knows of a publicly disclosed security vulnerability please +IMMEDIATELY email [security@stacklok.com](mailto:security@stacklok.com) to +inform us about the vulnerability so that we may start the patch, release, and +communication process. + +If a reporter contacts us to express intent to make an issue public before a +fix is available, we will ask whether the issue can be handled via a private +disclosure process. If the reporter denies the request, we will move swiftly +with the fix and release process. + +## Patch, release, and public communication + +For each vulnerability, the ToolHive security team will coordinate to create the +fix and release, and notify the rest of the community. + +All of the timelines below are suggestions and assume a Private Disclosure. + +- The security team drives the schedule using their best judgment based on + severity, development time, and release work. +- If the security team is dealing with a Public Disclosure all timelines become + ASAP. +- If the fix relies on another upstream project's disclosure timeline, that will + adjust the process as well. +- We will work with the upstream project to fit their timeline and best protect + ToolHive users. +- The Security team will give advance notice to the Private Distributors list + before the fix is released. + +### Fix team organization + +These steps should be completed within the first 24 hours of Disclosure. 
+ +- The security team will work quickly to identify relevant engineers from the + affected projects and packages and bring those engineers into the + [security advisory](https://docs.github.com/en/code-security/security-advisories/) + thread. +- These selected developers become the "Fix Team" (the fix team is often drawn + from the project's MAINTAINERS) + +### Fix development process + +These steps should be completed within 1-7 days of Disclosure. + +- Create a new + [security advisory](https://docs.github.com/en/code-security/security-advisories/) + in the affected repository by visiting + `https://github.com/StacklokLabs/toolhive/security/advisories/new` +- As many details as possible should be entered such as versions affected, CVE + (if available yet). As more information is discovered, edit and update the + advisory accordingly. +- Use the CVSS calculator to score a severity level. + ![CVSS Calculator](/images/calc.png) +- Add collaborators from codeowners team only (outside members can only be added + after approval from the security team) +- The reporter may be added to the issue to assist with review, but **only + reporters who have contacted the security team using a private channel**. +- Select 'Request CVE' ![Request CVE](/docs/static/img/cve.png) +- The security team / Fix Team create a private temporary fork + ![Security Fork](/docs/static/img/fork.png) +- The Fix team performs all work in a 'security advisory' within its temporary + fork +- CI can be checked locally using the [act](https://github.com/nektos/act) + project +- All communication happens within the security advisory, it is _not_ discussed + in slack channels or non private issues. +- The Fix Team will notify the security team that work on the fix branch is + completed, this can be done by tagging names in the advisory +- The Fix team and the security team will agree on fix release day +- The recommended release time is 4pm UTC on a non-Friday weekday. 
This means + the announcement will be seen morning Pacific, early evening Europe, and late + evening Asia. + +If the CVSS score is under ~4.0 +([a low severity score](https://www.first.org/cvss/specification-document#i5)) +or the assessed risk is low the Fix Team can decide to slow the release process +down in the face of holidays, developer bandwidth, etc. + +Note: CVSS is convenient but imperfect. Ultimately, the security team has +discretion on classifying the severity of a vulnerability. + +The severity of the bug and related handling decisions must be discussed in +the security advisory, never in public repos. + +### Fix disclosure process + +With the Fix Development underway, the security team needs to come up with an +overall communication plan for the wider community. This Disclosure process +should begin after the Fix Team has developed a Fix or mitigation so that a +realistic timeline can be communicated to users. + +**Fix release day** (Completed within 1-21 days of Disclosure) + +- The Fix Team will approve the related pull requests in the private temporary + branch of the security advisory +- The security team will merge the security advisory / temporary fork and its + commits into the main branch of the affected repository + ![Security Advisory](docs/images/publish.png) +- The security team will ensure all the binaries are built, signed, publicly + available, and functional. +- The security team will announce the new releases, the CVE number, severity, + and impact, and the location of the binaries to get wide distribution and user + action. As much as possible this announcement should be actionable, and + include any mitigating steps users can take prior to upgrading to a fixed + version. An announcement template is available below. The announcement will be + sent to the following channels: +- A link to fix will be posted to the + [Stacklok Discord Server](https://discord.gg/stacklok) in the #toolhive + channel. 
+ +## Retrospective + +These steps should be completed 1-3 days after the Release Date. The +retrospective process +[should be blameless](https://landing.google.com/sre/book/chapters/postmortem-culture.html). + +- The security team will send a retrospective of the process to the + [Stacklok Discord Server](https://discord.gg/stacklok) including details on + everyone involved, the timeline of the process, links to relevant PRs that + introduced the issue, if relevant, and any critiques of the response and + release process. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..64137a8 --- /dev/null +++ b/README.md @@ -0,0 +1,264 @@ +# Fetch MCP Server + +An SSE Go implementation of the `fetch` MCP server that retrieves web content. + +## Features + +- **Web Content Retrieval**: Fetches URLs and extracts textual content +- **Content Extraction**: Extract main content from web pages +- **Robots.txt Compliance**: Respects robots.txt rules (can be disabled) +- **Configurable**: Supports custom user agents and proxy settings + +## Why _this_ fetch and not `mcp/fetch`? + +This Go implementation provides similar functionality as the original [Python MCP fetch server](https://github.com/modelcontextprotocol/servers/tree/main/src/fetch) but has the following benefits: + +- Lower memory usage +- Faster startup time / shutdown time +- Single binary deployment - making tool poisoning attacks harder than in interpreted languages +- Better concurrent request handling +- Better container security [than original](https://github.com/modelcontextprotocol/servers/blob/main/src/fetch/Dockerfile) + - Non root user + - Distroless / minimal image + - Container signing with build provenance +- SSE not STDIO +- More test coverage + +## Prerequisites + +- Go 1.24 or later +- [Task](https://taskfile.dev/) for running tasks + +## Installation + +1. 
Clone the repository: + +```bash +git clone https://github.com/StacklokLabs/fetch.git +cd fetch +``` + +2. Install dependencies: + +```bash +task install +``` + +3. Build the server: + +```bash +task build +``` + +## Usage + +### Running the server + +To run the server with the default settings: + +```bash +task run +``` + +The server will start and expose: +- SSE endpoint: `http://localhost:8080/sse` +- Message endpoint: `http://localhost:8080/message` + +#### Command Line Options + +- `--addr`: Address to listen on (default: ":8080"; the default port can be changed with the MCP_PORT env var) +- `--user-agent`: Custom User-Agent string (default: "Mozilla/5.0 (compatible; MCPFetchBot/1.0)") +- `--ignore-robots-txt`: Ignore robots.txt rules +- `--proxy-url`: Proxy URL for requests + +#### Examples + +```bash +# Basic server on port 8080 +./build/fetch-server --addr :8080 + +# Custom port with user agent +./build/fetch-server --addr :3000 --user-agent "MyBot/1.0" + +# Ignore robots.txt on custom port +./build/fetch-server --addr :8080 --ignore-robots-txt + +# Use proxy +./build/fetch-server --addr :8080 --proxy-url "http://proxy.example.com:8080" + +# Use environment variable for port +MCP_PORT=9090 ./build/fetch-server +``` + +### Docker Usage + +```bash +# Build Docker image +docker build -t fetch-mcp-server . + +# Run with default settings +docker run -p 8080:8080 fetch-mcp-server + +# Run with custom arguments +docker run -p 9090:9090 fetch-mcp-server --addr :9090 +``` + +### Testing the Server + +```bash +# Send MCP request +curl -X POST http://localhost:8080/message \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc": "2.0", "id": 1, "method": "tools/list"}' + +# Test SSE connection +curl -N http://localhost:8080/sse +``` + +## MCP Tools + +The server provides a single tool called `fetch` with the following parameters: + +### Tool: `fetch` + +Fetches a URL from the internet and optionally extracts its contents as markdown. 
+ +#### Parameters + +- `url` (required): The URL to fetch +- `max_length` (optional): Maximum number of characters to return (default: 5000, max: 1000000) +- `start_index` (optional): Starting character index for content extraction (default: 0) +- `raw` (optional): Return raw HTML content without simplification (default: false) + +#### Examples + +```json +{ + "name": "fetch", + "arguments": { + "url": "https://example.com" + } +} +``` + +```json +{ + "name": "fetch", + "arguments": { + "url": "https://example.com", + "max_length": 1000, + "raw": false + } +} +``` + +## Development + +### Running tests + +```bash +task test +``` + +### Formatting code + +```bash +task fmt +``` + +### Linting code + +```bash +task lint +``` + +### Updating dependencies + +```bash +task deps +``` + +## Running as an MCP Server with ToolHive + +fetch can be run as a Model Context Protocol (MCP) server using [ToolHive](https://github.com/stacklok/toolhive), which simplifies the deployment and management of MCP servers. + +### Prerequisites + +1. Install ToolHive by following the [installation instructions](https://github.com/stacklok/toolhive#installation). +2. Ensure you have Docker or Podman installed on your system. 
+ +### Running fetch with ToolHive (Recommended) + +The easiest way to run fetch is using the packaged version available in ToolHive's registry: + +```bash +# Enable auto-discovery to automatically configure supported clients +thv config auto-discovery true + +# Run the fetch server +thv run fetch --transport sse + +# List running servers +thv list + +# Get detailed information about the server +thv registry info fetch +``` + +### Advanced Usage with Custom Configuration + +For advanced users who need custom configuration, you can also run fetch using the container image directly: + +```bash +# Run the fetch server using the published container image +thv run --name fetch --transport sse --target-port 8080 ghcr.io/stackloklabs/fetch/server:latest +``` + +This command: +- Names the server instance "fetch" +- Uses the SSE transport protocol +- Uses the latest published fetch image from GitHub Container Registry + +To use a specific version instead of the latest: + +```bash +thv run --name fetch --transport sse --target-port 8080 ghcr.io/stackloklabs/fetch/server:v0.0.1 +``` + +### Managing the fetch Server + +To verify that the fetch server is running: + +```bash +thv list +``` + +This will show all running MCP servers managed by ToolHive, including the fetch server. + +To stop the fetch server: + +```bash +# For custom named version +thv stop fetch +``` + +To remove the server instance completely: + +```bash +# For custom named version +thv rm fetch +``` + +## Contributing + +We welcome contributions to this MCP server! If you'd like to contribute, please review +the [CONTRIBUTING guide](./CONTRIBUTING.md) for details on how to get started. + +If you run into a bug or have a feature request, please +[open an issue](https://github.com/StacklokLabs/fetch/issues) in the +repository or join us in the `#mcp-servers` channel on our +[community Discord server](https://discord.gg/stacklok). 
+ +## License + +This project is licensed under the Apache v2 License - see the LICENSE file for details. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..e98c401 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,156 @@ +# Security Policy + +The ToolHive community takes security seriously! We appreciate your efforts to +disclose your findings responsibly and will make every effort to acknowledge +your contributions. + +## Reporting a vulnerability + +To report a security issue, please use the GitHub Security Advisory +["Report a Vulnerability"](https://github.com/StacklokLabs/toolhive/security/advisories/new) +tab. + +If you are unable to access GitHub you can also email us at +[security@stacklok.com](mailto:security@stacklok.com). + +Include steps to reproduce the vulnerability, the vulnerable versions, and any +additional files to reproduce the vulnerability. + +If you are only comfortable sharing under GPG, please start by sending an email +requesting a public PGP key to use for encryption. + +### Contacting the ToolHive security team + +Contact the team by sending email to +[security@stacklok.com](mailto:security@stacklok.com). + +## Disclosures + +### Private disclosure processes + +The ToolHive community asks that all suspected vulnerabilities be handled in +accordance with the +[Responsible Disclosure model](https://en.wikipedia.org/wiki/Responsible_disclosure). + +### Public disclosure processes + +If anyone knows of a publicly disclosed security vulnerability please +IMMEDIATELY email [security@stacklok.com](mailto:security@stacklok.com) to +inform us about the vulnerability so that we may start the patch, release, and +communication process. + +If a reporter contacts us to express intent to make an issue public before a +fix is available, we will ask whether the issue can be handled via a private +disclosure process. If the reporter denies the request, we will move swiftly +with the fix and release process. 
+ +## Patch, release, and public communication + +For each vulnerability, the ToolHive security team will coordinate to create the +fix and release, and notify the rest of the community. + +All of the timelines below are suggestions and assume a Private Disclosure. + +- The security team drives the schedule using their best judgment based on + severity, development time, and release work. +- If the security team is dealing with a Public Disclosure all timelines become + ASAP. +- If the fix relies on another upstream project's disclosure timeline, that will + adjust the process as well. +- We will work with the upstream project to fit their timeline and best protect + ToolHive users. +- The Security team will give advance notice to the Private Distributors list + before the fix is released. + +### Fix team organization + +These steps should be completed within the first 24 hours of Disclosure. + +- The security team will work quickly to identify relevant engineers from the + affected projects and packages and bring those engineers into the + [security advisory](https://docs.github.com/en/code-security/security-advisories/) + thread. +- These selected developers become the "Fix Team" (the fix team is often drawn + from the project's MAINTAINERS) + +### Fix development process + +These steps should be completed within 1-7 days of Disclosure. + +- Create a new + [security advisory](https://docs.github.com/en/code-security/security-advisories/) + in the affected repository by visiting + `https://github.com/StacklokLabs/toolhive/security/advisories/new` +- As many details as possible should be entered such as versions affected, CVE + (if available yet). As more information is discovered, edit and update the + advisory accordingly. +- Use the CVSS calculator to score a severity level. 
+ ![CVSS Calculator](/images/calc.png) +- Add collaborators from codeowners team only (outside members can only be added + after approval from the security team) +- The reporter may be added to the issue to assist with review, but **only + reporters who have contacted the security team using a private channel**. +- Select 'Request CVE' ![Request CVE](/docs/static/img/cve.png) +- The security team / Fix Team create a private temporary fork + ![Security Fork](/docs/static/img/fork.png) +- The Fix team performs all work in a 'security advisory' within its temporary + fork +- CI can be checked locally using the [act](https://github.com/nektos/act) + project +- All communication happens within the security advisory, it is _not_ discussed + in Slack channels or non-private issues. +- The Fix Team will notify the security team that work on the fix branch is + completed, this can be done by tagging names in the advisory +- The Fix team and the security team will agree on fix release day +- The recommended release time is 4pm UTC on a non-Friday weekday. This means + the announcement will be seen morning Pacific, early evening Europe, and late + evening Asia. + +If the CVSS score is under ~4.0 +([a low severity score](https://www.first.org/cvss/specification-document#i5)) +or the assessed risk is low the Fix Team can decide to slow the release process +down in the face of holidays, developer bandwidth, etc. + +Note: CVSS is convenient but imperfect. Ultimately, the security team has +discretion on classifying the severity of a vulnerability. + +The severity of the bug and related handling decisions must be discussed in +the security advisory, never in public repos. + +### Fix disclosure process + +With the Fix Development underway, the security team needs to come up with an +overall communication plan for the wider community. 
This Disclosure process +should begin after the Fix Team has developed a Fix or mitigation so that a +realistic timeline can be communicated to users. + +**Fix release day** (Completed within 1-21 days of Disclosure) + +- The Fix Team will approve the related pull requests in the private temporary + branch of the security advisory +- The security team will merge the security advisory / temporary fork and its + commits into the main branch of the affected repository + ![Security Advisory](docs/images/publish.png) +- The security team will ensure all the binaries are built, signed, publicly + available, and functional. +- The security team will announce the new releases, the CVE number, severity, + and impact, and the location of the binaries to get wide distribution and user + action. As much as possible this announcement should be actionable, and + include any mitigating steps users can take prior to upgrading to a fixed + version. An announcement template is available below. The announcement will be + sent to the following channels: +- A link to fix will be posted to the + [Stacklok Discord Server](https://discord.gg/stacklok) in the #toolhive + channel. + +## Retrospective + +These steps should be completed 1-3 days after the Release Date. The +retrospective process +[should be blameless](https://landing.google.com/sre/book/chapters/postmortem-culture.html). + +- The security team will send a retrospective of the process to the + [Stacklok Discord Server](https://discord.gg/stacklok) including details on + everyone involved, the timeline of the process, links to relevant PRs that + introduced the issue, if relevant, and any critiques of the response and + release process. 
\ No newline at end of file diff --git a/Taskfile.yml b/Taskfile.yml new file mode 100644 index 0000000..dfed3c7 --- /dev/null +++ b/Taskfile.yml @@ -0,0 +1,60 @@ +version: '3' + +vars: + BINARY_NAME: fetch-server + BUILD_DIR: build + MAIN_PACKAGE: ./cmd/server + +tasks: + default: + desc: Run tests and build the application + deps: [test, build] + + build: + desc: Build the application + cmds: + - mkdir -p {{.BUILD_DIR}} + - go build -o {{.BUILD_DIR}}/{{.BINARY_NAME}} {{.MAIN_PACKAGE}} + + run: + desc: Run the application + deps: [build] + cmds: + - ./{{.BUILD_DIR}}/{{.BINARY_NAME}} + + lint: + desc: Run linting tools + cmds: + - golangci-lint run ./... + - go vet ./... + + lint-fix: + desc: Run linting tools, and apply fixes. + cmds: + - golangci-lint run --fix ./... + + test: + desc: Run tests + cmds: + - go test -v ./... + + clean: + desc: Clean the build directory + cmds: + - rm -rf {{.BUILD_DIR}} + + fmt: + desc: Format the code + cmds: + - go fmt ./... + - golangci-lint run --fix + + deps: + desc: Update dependencies + cmds: + - go mod tidy + + install: + desc: Install dependencies + cmds: + - go mod download diff --git a/cmd/server/main.go b/cmd/server/main.go new file mode 100644 index 0000000..7395b67 --- /dev/null +++ b/cmd/server/main.go @@ -0,0 +1,39 @@ +// Package main is the entry point for the fetch MCP server. 
+package main + +import ( + "context" + "log" + "os" + "os/signal" + "syscall" + + "github.com/chrisburns/fetch-mcp-server/pkg/config" + "github.com/chrisburns/fetch-mcp-server/pkg/server" +) + +// main parses the command line configuration, starts the fetch server in a +// background goroutine and then blocks until either the server reports an +// error or the process receives SIGINT/SIGTERM. +func main() { + // Parse configuration + cfg := config.ParseFlags() + + // Cancel ctx when the process receives SIGINT or SIGTERM. A bare + // context.WithCancel is never cancelled by anything here, which would leave + // the shutdown branch of the select below unreachable. + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + // Create and configure server + fs := server.NewFetchServer(cfg) + + // Start server. The channel is buffered so the goroutine can hand over its + // error and exit even if main is no longer receiving. + serverErrCh := make(chan error, 1) + go func() { + if err := fs.Start(); err != nil { + log.Printf("Server error: %v", err) + serverErrCh <- err + } + }() + + // Wait for error or shutdown signal + select { + case err := <-serverErrCh: + log.Fatalf("Server failed to start: %v", err) + case <-ctx.Done(): + log.Println("Shutdown signal received") + } +} diff --git a/dco.md b/dco.md new file mode 100644 index 0000000..ec97b99 --- /dev/null +++ b/dco.md @@ -0,0 +1,35 @@ +# Developer Certificate of Origin (DCO) +In order to contribute to the project, you must agree to the Developer Certificate of Origin. A [Developer Certificate of Origin (DCO)](https://developercertificate.org/) +is an affirmation that the developer contributing the proposed changes has the necessary rights to submit those changes. +A DCO provides some additional legal protections while being relatively easy to do. + +The entire DCO can be summarized as: +- Certify that the submitted code can be submitted under the open source license of the project (e.g. Apache 2.0) +- I understand that what I am contributing is public and will be redistributed indefinitely + + +## How to Use Developer Certificate of Origin +In order to contribute to the project, you must agree to the Developer Certificate of Origin. To confirm that you agree, your commit message must include a Signed-off-by trailer at the bottom of the commit message. 
+ +For example, it might look like the following: +```bash +A commit message + +Closes gh-345 + +Signed-off-by: jane marmot +``` + +The Signed-off-by [trailer](https://git-scm.com/docs/git-interpret-trailers) can be added automatically by using the [-s or –signoff command line option](https://git-scm.com/docs/git-commit/2.13.7#Documentation/git-commit.txt--s) when specifying your commit message: +```bash +git commit -s -m +``` +If you have chosen the [Keep my email address private](https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-email-preferences/setting-your-commit-email-address#about-commit-email-addresses) option within GitHub, the Signed-off-by trailer might look something like: +```bash +A commit message + +Closes gh-345 + +Signed-off-by: jane marmot <462403+jmarmot@users.noreply.github.com> +``` + diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..7d26fb6 --- /dev/null +++ b/go.mod @@ -0,0 +1,23 @@ +module github.com/chrisburns/fetch-mcp-server + +go 1.23.2 + +toolchain go1.24.0 + +require ( + github.com/JohannesKaufmann/html-to-markdown v1.4.2 + github.com/go-shiori/go-readability v0.0.0-20231029095239-6b97d5aba789 + github.com/mark3labs/mcp-go v0.31.0 + golang.org/x/net v0.18.0 +) + +require ( + github.com/PuerkitoBio/goquery v1.8.1 // indirect + github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 // indirect + github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/spf13/cast v1.7.1 // indirect + github.com/yosida95/uritemplate/v3 v3.0.2 // indirect + golang.org/x/text v0.14.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..613c20a --- /dev/null +++ b/go.sum @@ -0,0 +1,112 @@ +github.com/JohannesKaufmann/html-to-markdown v1.4.2 h1:Jt3i/2l98+yOb5uD0ovoIGwccF4DfNxBeUye4P5KP9g= 
+github.com/JohannesKaufmann/html-to-markdown v1.4.2/go.mod h1:AwPLQeuGhVGKyWXJR8t46vR0iL1d3yGuembj8c1VcJU= +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 h1:zx4B0AiwqKDQq+AgqxWeHwbbLJQeidq20hgfP+aMNWI= +github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65/go.mod h1:NPO1+buE6TYOWhUI98/hXLHHJhunIpXRuvDN4xjkCoE= +github.com/go-shiori/go-readability v0.0.0-20231029095239-6b97d5aba789 h1:G6wSuUyCoLB9jrUokipsmFuRi8aJozt3phw/g9Sl4Xs= +github.com/go-shiori/go-readability v0.0.0-20231029095239-6b97d5aba789/go.mod h1:2DpZlTJO/ycxp/vsc/C11oUyveStOgIXB88SYV1lncI= +github.com/gogs/chardet v0.0.0-20191104214054-4b6791f73a28/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= 
+github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mark3labs/mcp-go v0.31.0 h1:4UxSV8aM770OPmTvaVe/b1rA2oZAjBMhGBfUgOGut+4= +github.com/mark3labs/mcp-go v0.31.0/go.mod h1:rXqOudj/djTORU/ThxYx8fqEVj/5pvTuuebQ2RC7uk4= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= +github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= +github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= +github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= +github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y= +github.com/spf13/cast v1.7.1/go.mod 
h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4= +github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +github.com/yuin/goldmark v1.6.0 h1:boZcn2GTjpsynOsC0iJHnBWa4Bi0qzfJjthwauItG68= +github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.15.0/go.mod h1:4ChreQoLWfG3xLDer1WdlH5NdlQ3+mwnQq1YTKY+72g= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210505214959-0714010a04ed/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net 
v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.18.0 h1:mIYleuAkSbHh0tCv7RvjL3F6ZVbLjq4+R7zbOn3Kokg= +golang.org/x/net v0.18.0/go.mod h1:/czyP5RqHAH4odGYxBJ1qz0+CE5WZ+2j1YgoEo8F2jQ= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term 
v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 
h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/config/config.go b/pkg/config/config.go new file mode 100644 index 0000000..05a04f0 --- /dev/null +++ b/pkg/config/config.go @@ -0,0 +1,68 @@ +// Package config provides server configuration functionality. +package config + +import ( + "flag" + "fmt" + "log" + "os" + "strconv" +) + +// Defaults and identity strings for the fetch server. +const ( + // DefaultPort is the listen address used when MCP_PORT is unset or invalid. + DefaultPort = ":8080" + ServerName = "fetch-server" + ServerVersion = "1.0.0" + // DefaultUA is the User-Agent used when --user-agent is absent or empty. + DefaultUA = "Mozilla/5.0 (compatible; MCPFetchBot/1.0)" +) + +// Config holds the server configuration +type Config struct { + Address string + UserAgent string + IgnoreRobots bool + ProxyURL string +} + +// ParseFlags parses command line flags and returns configuration. +// +// The listen address defaults to GetDefaultAddress() and the User-Agent to +// DefaultUA. It registers flags on the global flag set and calls flag.Parse, +// so it is meant to be called once at start-up. +func ParseFlags() Config { + var config Config + + flag.StringVar(&config.Address, "addr", GetDefaultAddress(), "Address to listen on") + // Register DefaultUA as the flag default so that -help reports the value that is actually used. + flag.StringVar(&config.UserAgent, "user-agent", DefaultUA, "Custom User-Agent string") + flag.BoolVar(&config.IgnoreRobots, "ignore-robots-txt", false, "Ignore robots.txt rules") + flag.StringVar(&config.ProxyURL, "proxy-url", "", "Proxy URL for requests") + flag.Parse() + + // An explicitly empty --user-agent still falls back to the default + if config.UserAgent == "" { + config.UserAgent = DefaultUA + } + + return config +} + +// GetDefaultAddress returns the default server address from environment or constant. +// +// It reads MCP_PORT: an integer in the range 1-65535 yields ":<port>"; an empty, +// non-numeric or out-of-range value yields DefaultPort (the latter two are logged). +func GetDefaultAddress() string { + portEnv := os.Getenv("MCP_PORT") + if portEnv == "" { + return DefaultPort + } + + port, err := strconv.Atoi(portEnv) + if err != nil { + log.Printf("Invalid port number in MCP_PORT environment variable: %v, using default port 8080", err) + return DefaultPort + } + + // Validate port range + if port < 1 || port > 65535 { + log.Printf("Port %d out of valid range 
(1-65535), using default port", port) + return DefaultPort + } + + return fmt.Sprintf(":%d", port) +} diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go new file mode 100644 index 0000000..0d9dc3f --- /dev/null +++ b/pkg/config/config_test.go @@ -0,0 +1,48 @@ +package config + +import ( + "testing" +) + +// TestGetDefaultAddress checks how GetDefaultAddress maps MCP_PORT values to a listen address. +func TestGetDefaultAddress(t *testing.T) { + tests := []struct { + name string + envValue string + expected string + }{ + { + name: "no environment variable", + envValue: "", + expected: DefaultPort, + }, + { + name: "valid port", + envValue: "9090", + expected: ":9090", + }, + { + name: "invalid port - non-numeric", + envValue: "abc", + expected: DefaultPort, + }, + { + name: "invalid port - out of range", + envValue: "70000", + expected: DefaultPort, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Always set MCP_PORT, even to "", so that a value inherited from the + // developer's or CI environment cannot leak into the test. + // GetDefaultAddress treats an empty value the same as an unset one. + t.Setenv("MCP_PORT", tt.envValue) + + result := GetDefaultAddress() + if result != tt.expected { + t.Errorf("expected %q, got %q", tt.expected, result) + } + }) + } +} diff --git a/pkg/fetcher/fetcher.go b/pkg/fetcher/fetcher.go new file mode 100644 index 0000000..8711b21 --- /dev/null +++ b/pkg/fetcher/fetcher.go @@ -0,0 +1,115 @@ +// Package fetcher provides HTTP content fetching and processing functionality. 
+package fetcher + +import ( + "fmt" + "io" + "log" + "net/http" + "strings" + + "github.com/chrisburns/fetch-mcp-server/pkg/processor" + "github.com/chrisburns/fetch-mcp-server/pkg/robots" +) + +// HTTPFetcher handles HTTP requests and content retrieval +type HTTPFetcher struct { + httpClient *http.Client + robotsChecker *robots.Checker + processor *processor.ContentProcessor + userAgent string +} + +// NewHTTPFetcher creates a new HTTP fetcher instance +func NewHTTPFetcher( + httpClient *http.Client, + robotsChecker *robots.Checker, + contentProcessor *processor.ContentProcessor, + userAgent string, +) *HTTPFetcher { + return &HTTPFetcher{ + httpClient: httpClient, + robotsChecker: robotsChecker, + processor: contentProcessor, + userAgent: userAgent, + } +} + +// FetchRequest holds the parameters for a fetch request +type FetchRequest struct { + URL string + MaxLength *int + StartIndex *int + Raw bool +} + +// FetchURL retrieves and processes content from the specified URL +func (f *HTTPFetcher) FetchURL(req *FetchRequest) (string, error) { + log.Printf("Fetching URL: %s", req.URL) + + // Check robots.txt + if !f.robotsChecker.IsAllowed(req.URL) { + log.Printf("Access denied by robots.txt for URL: %s", req.URL) + return "", fmt.Errorf("access to %s is disallowed by robots.txt", req.URL) + } + + // Fetch the content + content, err := f.fetchURL(req.URL, req.Raw) + if err != nil { + return "", err + } + + // Apply formatting + formattedContent := f.processor.FormatContent(content, req.StartIndex, req.MaxLength) + + log.Printf("Fetch completed successfully for %s, returning %d characters", req.URL, len(formattedContent)) + return formattedContent, nil +} + +// fetchURL retrieves content from the specified URL +func (f *HTTPFetcher) fetchURL(url string, raw bool) (string, error) { + // Create HTTP request + req, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Printf("Failed to create HTTP request for %s: %v", url, err) + return "", fmt.Errorf("failed 
to create request: %v", err) + } + + // Set headers + req.Header.Set("User-Agent", f.userAgent) + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + + // Make HTTP request + resp, err := f.httpClient.Do(req) + if err != nil { + log.Printf("HTTP request failed for %s: %v", url, err) + return "", fmt.Errorf("failed to fetch URL: %v", err) + } + defer resp.Body.Close() + + log.Printf("HTTP %d response from %s (Content-Type: %s)", resp.StatusCode, url, resp.Header.Get("Content-Type")) + + // Check status code + if resp.StatusCode != http.StatusOK { + log.Printf("Non-200 status code %d for %s: %s", resp.StatusCode, url, resp.Status) + return "", fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) + } + + // Read response body + body, err := io.ReadAll(resp.Body) + if err != nil { + log.Printf("Failed to read response body from %s: %v", url, err) + return "", fmt.Errorf("failed to read response body: %v", err) + } + + log.Printf("Successfully fetched %d bytes from %s", len(body), url) + + content := string(body) + + // Process HTML if not raw mode + if !raw && strings.Contains(resp.Header.Get("Content-Type"), "text/html") { + content = f.processor.ProcessHTML(content) + } + + return content, nil +} diff --git a/pkg/fetcher/fetcher_test.go b/pkg/fetcher/fetcher_test.go new file mode 100644 index 0000000..df6cb10 --- /dev/null +++ b/pkg/fetcher/fetcher_test.go @@ -0,0 +1,166 @@ +package fetcher + +import ( + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/chrisburns/fetch-mcp-server/pkg/processor" + "github.com/chrisburns/fetch-mcp-server/pkg/robots" +) + +// createMockServer creates a test HTTP server with various endpoints +func createMockServer() *httptest.Server { + mux := http.NewServeMux() + + // HTML content endpoint + mux.HandleFunc("/html", func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/html") + w.Write([]byte(`

Test Page

This is a test page.

`)) + }) + + // JSON content endpoint + mux.HandleFunc("/json", func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.Write([]byte(`{"message": "Hello, World!", "status": "ok"}`)) + }) + + // robots.txt endpoint + mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/plain") + robotsContent := `User-agent: * +Disallow: /private/ +Disallow: /admin/ + +User-agent: TestBot +Disallow: /blocked/` + w.Write([]byte(robotsContent)) + }) + + // Error endpoint + mux.HandleFunc("/error", func(w http.ResponseWriter, _ *http.Request) { + http.Error(w, "Internal Server Error", http.StatusInternalServerError) + }) + + return httptest.NewServer(mux) +} + +func createTestFetcher() *HTTPFetcher { + client := &http.Client{Timeout: 5 * time.Second} + robotsChecker := robots.NewChecker("TestBot/1.0", false, client) + contentProcessor := processor.NewContentProcessor() + + return NewHTTPFetcher(client, robotsChecker, contentProcessor, "TestBot/1.0") +} + +func TestNewHTTPFetcher(t *testing.T) { + client := &http.Client{Timeout: 5 * time.Second} + robotsChecker := robots.NewChecker("TestBot/1.0", false, client) + contentProcessor := processor.NewContentProcessor() + userAgent := "TestBot/1.0" + + fetcher := NewHTTPFetcher(client, robotsChecker, contentProcessor, userAgent) + + if fetcher.httpClient != client { + t.Error("expected httpClient to be set correctly") + } + + if fetcher.robotsChecker != robotsChecker { + t.Error("expected robotsChecker to be set correctly") + } + + if fetcher.processor != contentProcessor { + t.Error("expected processor to be set correctly") + } + + if fetcher.userAgent != userAgent { + t.Errorf("expected userAgent %q, got %q", userAgent, fetcher.userAgent) + } +} + +func TestFetchURL(t *testing.T) { + server := createMockServer() + defer server.Close() + + fetcher := createTestFetcher() + + tests := []struct { + name string + request *FetchRequest + 
expectError bool + expectedLen int // approximate length check + }{ + { + name: "successful HTML fetch", + request: &FetchRequest{ + URL: server.URL + "/html", + Raw: false, + }, + expectError: false, + expectedLen: 10, // Should have some content after markdown conversion + }, + { + name: "successful JSON fetch", + request: &FetchRequest{ + URL: server.URL + "/json", + Raw: true, + }, + expectError: false, + expectedLen: 30, // JSON content length + }, + { + name: "server error", + request: &FetchRequest{ + URL: server.URL + "/error", + Raw: false, + }, + expectError: true, + expectedLen: 0, + }, + { + name: "blocked by robots.txt", + request: &FetchRequest{ + URL: server.URL + "/blocked/page", + Raw: false, + }, + expectError: true, + expectedLen: 0, + }, + { + name: "fetch with formatting", + request: &FetchRequest{ + URL: server.URL + "/json", + Raw: true, + MaxLength: intPtr(20), + }, + expectError: false, + expectedLen: 20, // Should be truncated + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := fetcher.FetchURL(tt.request) + + if tt.expectError && err == nil { + t.Error("expected error but got none") + } + + if !tt.expectError && err != nil { + t.Errorf("unexpected error: %v", err) + } + + if !tt.expectError { + if len(result) < tt.expectedLen { + t.Errorf("expected result length >= %d, got %d", tt.expectedLen, len(result)) + } + } + }) + } +} + +// intPtr returns a pointer to an int +func intPtr(i int) *int { + return &i +} diff --git a/pkg/processor/processor.go b/pkg/processor/processor.go new file mode 100644 index 0000000..b86ef96 --- /dev/null +++ b/pkg/processor/processor.go @@ -0,0 +1,69 @@ +// Package processor handles content processing and formatting. 
+package processor + +import ( + "strings" + + md "github.com/JohannesKaufmann/html-to-markdown" + "github.com/go-shiori/go-readability" + "golang.org/x/net/html" +) + +// ContentProcessor handles HTML processing and content formatting +type ContentProcessor struct { + htmlConverter *md.Converter +} + +// NewContentProcessor creates a new content processor instance +func NewContentProcessor() *ContentProcessor { + converter := md.NewConverter("", true, nil) + return &ContentProcessor{ + htmlConverter: converter, + } +} + +// ProcessHTML converts HTML content to readable markdown +func (p *ContentProcessor) ProcessHTML(htmlContent string) string { + // Parse HTML document + doc, err := html.Parse(strings.NewReader(htmlContent)) + if err != nil { + return htmlContent + } + + // Extract readable content using readability + article, err := readability.FromDocument(doc, nil) + if err == nil && article.Content != "" { + htmlContent = article.Content + } + + // Convert to markdown + markdown, err := p.htmlConverter.ConvertString(htmlContent) + if err != nil { + return htmlContent + } + + return markdown +} + +// FormatContent applies pagination and truncation to content +func (*ContentProcessor) FormatContent(content string, startIndex, maxLength *int) string { + // Apply start index offset + start := 0 + if startIndex != nil { + start = *startIndex + } + + if start > len(content) { + start = len(content) + } + + content = content[start:] + + // Apply length limit + if maxLength != nil && len(content) > *maxLength { + content = content[:*maxLength] + content += "\n\n[Content truncated. 
Use start_index to get more content.]" + } + + return content +} diff --git a/pkg/processor/processor_test.go b/pkg/processor/processor_test.go new file mode 100644 index 0000000..a2cec90 --- /dev/null +++ b/pkg/processor/processor_test.go @@ -0,0 +1,124 @@ +package processor + +import ( + "testing" +) + +func TestNewContentProcessor(t *testing.T) { + processor := NewContentProcessor() + + if processor.htmlConverter == nil { + t.Error("expected htmlConverter to be initialized") + } +} + +func TestFormatContent(t *testing.T) { + processor := NewContentProcessor() + + tests := []struct { + name string + content string + startIndex *int + maxLength *int + expected string + }{ + { + name: "no formatting", + content: "Hello, World!", + startIndex: nil, + maxLength: nil, + expected: "Hello, World!", + }, + { + name: "with start index", + content: "Hello, World!", + startIndex: intPtr(7), + maxLength: nil, + expected: "World!", + }, + { + name: "with max length", + content: "Hello, World!", + startIndex: nil, + maxLength: intPtr(5), + expected: "Hello\n\n[Content truncated. Use start_index to get more content.]", + }, + { + name: "with start index and max length", + content: "Hello, World!", + startIndex: intPtr(7), + maxLength: intPtr(3), + expected: "Wor\n\n[Content truncated. 
Use start_index to get more content.]", + }, + { + name: "start index beyond content length", + content: "Hello", + startIndex: intPtr(10), + maxLength: nil, + expected: "", + }, + { + name: "max length larger than content", + content: "Hello", + startIndex: nil, + maxLength: intPtr(100), + expected: "Hello", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := processor.FormatContent(tt.content, tt.startIndex, tt.maxLength) + if result != tt.expected { + t.Errorf("expected %q, got %q", tt.expected, result) + } + }) + } +} + +func TestProcessHTML(t *testing.T) { + processor := NewContentProcessor() + + tests := []struct { + name string + input string + expected string + }{ + { + name: "simple HTML", + input: "

Title

Content

", + expected: "# Title\n\nContent", // This is an approximation - actual output may vary + }, + { + name: "invalid HTML", + input: "not html content", + expected: "not html content", + }, + { + name: "empty HTML", + input: "", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := processor.ProcessHTML(tt.input) + + // For HTML processing, we'll just check that we get some output + // The exact markdown conversion may vary between library versions + if tt.input == "" && result != "" { + t.Errorf("expected empty result for empty input, got %q", result) + } + + if tt.input != "" && result == "" { + t.Error("expected non-empty result for non-empty input") + } + }) + } +} + +// intPtr returns a pointer to an int +func intPtr(i int) *int { + return &i +} diff --git a/pkg/robots/robots.go b/pkg/robots/robots.go new file mode 100644 index 0000000..fa98391 --- /dev/null +++ b/pkg/robots/robots.go @@ -0,0 +1,103 @@ +// Package robots provides robots.txt validation functionality. 
// Patterns for the two robots.txt directives this package understands. They
// are compiled once instead of on every parse.
var (
	userAgentPattern = regexp.MustCompile(`(?i)^User-agent:\s*(.+)$`)
	disallowPattern  = regexp.MustCompile(`(?i)^Disallow:\s*(.*)$`) // Allow empty disallow rules
)

// Checker handles robots.txt validation for web crawling.
type Checker struct {
	userAgent    string
	ignoreRobots bool
	httpClient   *http.Client
}

// NewChecker creates a new robots.txt checker that identifies itself as
// userAgent and fetches robots.txt through httpClient. When ignoreRobots is
// true every URL is allowed and no request is made.
func NewChecker(userAgent string, ignoreRobots bool, httpClient *http.Client) *Checker {
	return &Checker{
		userAgent:    userAgent,
		ignoreRobots: ignoreRobots,
		httpClient:   httpClient,
	}
}

// IsAllowed reports whether targetURL may be accessed according to the
// robots.txt of its host. An unparsable URL is rejected; a robots.txt that
// cannot be fetched (network error or non-200 status) allows access.
func (c *Checker) IsAllowed(targetURL string) bool {
	if c.ignoreRobots {
		return true
	}

	parsedURL, err := url.Parse(targetURL)
	if err != nil {
		return false
	}

	robotsContent, err := c.fetchRobotsContent(parsedURL)
	if err != nil {
		// If we can't fetch robots.txt, allow access
		return true
	}

	return c.parseRobotsRules(robotsContent, parsedURL.Path)
}

// fetchRobotsContent retrieves the robots.txt file for the host of parsedURL.
// It returns an error when the request fails or the status is not 200; the
// response body is closed in every case.
func (c *Checker) fetchRobotsContent(parsedURL *url.URL) (string, error) {
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)

	req, err := http.NewRequest(http.MethodGet, robotsURL, nil)
	if err != nil {
		return "", err
	}

	req.Header.Set("User-Agent", c.userAgent)

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("failed to fetch robots.txt: %w", err)
	}
	// Close before inspecting the status so non-200 responses don't leak the body.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("failed to fetch robots.txt: unexpected status %s", resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	return string(body), nil
}

// parseRobotsRules parses robots.txt content and reports whether targetPath
// may be accessed by this checker's user agent.
//
// A group is a run of consecutive User-agent lines followed by its rules. A
// group applies when one of its agents is "*" or is contained in our user
// agent (compared case-insensitively). A User-agent line that follows a
// Disallow line starts a new group, so rules written for other crawlers are
// not applied to us. Rules of all applicable groups are combined. Only
// Disallow is interpreted; comments ("# ...") are stripped and other
// directives are ignored without affecting grouping.
func (c *Checker) parseRobotsRules(robotsContent, targetPath string) bool {
	ourAgent := strings.ToLower(c.userAgent)

	groupApplies := false // the current group's rules apply to us
	inAgentRun := false   // the previous directive was a User-agent line

	for _, line := range strings.Split(robotsContent, "\n") {
		if i := strings.IndexByte(line, '#'); i >= 0 {
			line = line[:i]
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		if m := userAgentPattern.FindStringSubmatch(line); m != nil {
			if !inAgentRun {
				// First User-agent line after rules: a new group begins.
				groupApplies = false
				inAgentRun = true
			}
			agent := strings.ToLower(strings.TrimSpace(m[1]))
			if agent == "*" || strings.Contains(ourAgent, agent) {
				groupApplies = true
			}
			continue
		}

		m := disallowPattern.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		inAgentRun = false
		if !groupApplies {
			continue
		}

		disallowPath := strings.TrimSpace(m[1])
		// Empty disallow means allow everything for this user agent
		if disallowPath == "" {
			continue
		}
		if disallowPath == "/" || strings.HasPrefix(targetPath, disallowPath) {
			return false
		}
	}

	return true
}
+func TestIsAllowed(t *testing.T) { + server := createMockRobotsServer() + defer server.Close() + + client := &http.Client{Timeout: 5 * time.Second} + + tests := []struct { + name string + targetURL string + ignoreRobots bool + userAgent string + expected bool + }{ + { + name: "ignore robots enabled", + targetURL: server.URL + "/anything", + ignoreRobots: true, + userAgent: "TestBot/1.0", + expected: true, + }, + { + name: "allowed path", + targetURL: server.URL + "/public/page", + ignoreRobots: false, + userAgent: "TestBot/1.0", + expected: true, + }, + { + name: "disallowed path for all user agents", + targetURL: server.URL + "/private/secret", + ignoreRobots: false, + userAgent: "TestBot/1.0", + expected: false, + }, + { + name: "disallowed path for specific user agent", + targetURL: server.URL + "/blocked/page", + ignoreRobots: false, + userAgent: "TestBot/1.0", + expected: false, + }, + { + name: "URL with no robots.txt allows access", + targetURL: "http://nonexistent-host-12345.invalid/page", + ignoreRobots: false, + userAgent: "TestBot/1.0", + expected: true, // Can't fetch robots.txt, so allow access + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + checker := NewChecker(tt.userAgent, tt.ignoreRobots, client) + result := checker.IsAllowed(tt.targetURL) + if result != tt.expected { + t.Errorf("expected %v, got %v", tt.expected, result) + } + }) + } +} + +func TestParseRobotsRules(t *testing.T) { + client := &http.Client{Timeout: 5 * time.Second} + checker := NewChecker("TestBot/1.0", false, client) + + tests := []struct { + name string + robotsContent string + targetPath string + expected bool + }{ + { + name: "allowed path", + robotsContent: `User-agent: * +Disallow: /private/`, + targetPath: "/public/page", + expected: true, + }, + { + name: "disallowed path", + robotsContent: `User-agent: * +Disallow: /private/`, + targetPath: "/private/secret", + expected: false, + }, + { + name: "root disallow", + robotsContent: 
`User-agent: * +Disallow: /`, + targetPath: "/anything", + expected: false, + }, + { + name: "general disallow applies to all", + robotsContent: `User-agent: * +Disallow: /anything`, + targetPath: "/anything", + expected: false, + }, + { + name: "empty robots.txt", + robotsContent: "", + targetPath: "/anything", + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := checker.parseRobotsRules(tt.robotsContent, tt.targetPath) + if result != tt.expected { + t.Errorf("expected %v, got %v", tt.expected, result) + } + }) + } +} diff --git a/pkg/server/server.go b/pkg/server/server.go new file mode 100644 index 0000000..ebdca90 --- /dev/null +++ b/pkg/server/server.go @@ -0,0 +1,156 @@ +// Package server provides the MCP server implementation for fetching web content. +package server + +import ( + "context" + "fmt" + "log" + "net/http" + "net/url" + "time" + + "github.com/chrisburns/fetch-mcp-server/pkg/config" + "github.com/chrisburns/fetch-mcp-server/pkg/fetcher" + "github.com/chrisburns/fetch-mcp-server/pkg/processor" + "github.com/chrisburns/fetch-mcp-server/pkg/robots" + "github.com/mark3labs/mcp-go/mcp" + "github.com/mark3labs/mcp-go/server" +) + +// FetchServer represents the MCP server for fetching web content +type FetchServer struct { + config config.Config + fetcher *fetcher.HTTPFetcher + mcpServer *server.MCPServer +} + +// NewFetchServer creates a new fetch server instance +func NewFetchServer(cfg config.Config) *FetchServer { + // Create HTTP client with timeout + client := &http.Client{ + Timeout: 30 * time.Second, + } + + // Configure proxy if provided + if cfg.ProxyURL != "" { + if proxyURLParsed, err := url.Parse(cfg.ProxyURL); err == nil { + client.Transport = &http.Transport{ + Proxy: http.ProxyURL(proxyURLParsed), + } + } + } + + // Create components + robotsChecker := robots.NewChecker(cfg.UserAgent, cfg.IgnoreRobots, client) + contentProcessor := processor.NewContentProcessor() + httpFetcher := 
fetcher.NewHTTPFetcher(client, robotsChecker, contentProcessor, cfg.UserAgent) + + // Create MCP server + mcpServer := server.NewMCPServer(config.ServerName, config.ServerVersion) + + fs := &FetchServer{ + config: cfg, + fetcher: httpFetcher, + mcpServer: mcpServer, + } + + // Setup tools + fs.setupTools() + + return fs +} + +// setupTools registers the fetch tool with the MCP server +func (fs *FetchServer) setupTools() { + fetchTool := mcp.NewTool("fetch", + mcp.WithDescription("Fetches a URL from the internet and optionally extracts its contents as markdown."), + mcp.WithString("url", + mcp.Required(), + mcp.Description("URL to fetch"), + mcp.Pattern("^https?://.*"), + ), + mcp.WithNumber("max_length", + mcp.Description("Maximum number of characters to return."), + ), + mcp.WithNumber("start_index", + mcp.Description("Start index for truncated content."), + ), + mcp.WithBoolean("raw", + mcp.Description("Get the actual HTML content of the requested page, without simplification."), + ), + ) + + fs.mcpServer.AddTool(fetchTool, fs.handleFetchTool) +} + +// handleFetchTool processes fetch tool requests +func (fs *FetchServer) handleFetchTool(_ context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { + log.Printf("Tool call received: %s", request.Params.Name) + + // Parse request parameters + fetchReq, err := fs.parseFetchRequest(request) + if err != nil { + log.Printf("Tool call failed - %v", err) + return mcp.NewToolResultError(err.Error()), nil + } + + // Fetch the content + content, err := fs.fetcher.FetchURL(fetchReq) + if err != nil { + return mcp.NewToolResultError(err.Error()), nil + } + + return mcp.NewToolResultText(content), nil +} + +// parseFetchRequest extracts and validates parameters from the MCP request +func (*FetchServer) parseFetchRequest(request mcp.CallToolRequest) (*fetcher.FetchRequest, error) { + // Extract URL parameter (required) + urlParam, err := request.RequireString("url") + if err != nil { + return nil, 
fmt.Errorf("URL is required") + } + + // Extract optional parameters + maxLength := request.GetInt("max_length", 0) + startIndex := request.GetInt("start_index", 0) + raw := request.GetBool("raw", false) + + fetchReq := &fetcher.FetchRequest{ + URL: urlParam, + Raw: raw, + } + + if maxLength > 0 { + fetchReq.MaxLength = &maxLength + } + + if startIndex > 0 { + fetchReq.StartIndex = &startIndex + } + + return fetchReq, nil +} + +// Start starts the MCP server +func (fs *FetchServer) Start() error { + fs.logServerStartup() + + sseServer := server.NewSSEServer(fs.mcpServer) + return sseServer.Start(fs.config.Address) +} + +// logServerStartup prints startup information +func (fs *FetchServer) logServerStartup() { + log.Printf("=== Starting MCP Fetch Server ===") + log.Printf("Server address: %s", fs.config.Address) + log.Printf("User agent: %s", fs.config.UserAgent) + log.Printf("Ignore robots.txt: %v", fs.config.IgnoreRobots) + if fs.config.ProxyURL != "" { + log.Printf("Using proxy: %s", fs.config.ProxyURL) + } + log.Printf("Available tools: fetch") + log.Printf("SSE endpoint: http://localhost%s/sse", fs.config.Address) + log.Printf("Message endpoint: http://localhost%s/message", fs.config.Address) + log.Printf("=== Server starting ===") +}