Skip to content

Commit 33c5ce1

Browse files
authored
Update robots.json
Updated robots list with five new proposed AI bots: aiHitBot, Cotoyogi, Factset_spyderbot, FirecrawlAgent, and TikTokSpider.
1 parent 774b1dd commit 33c5ce1

File tree

1 file changed

+36
-1
lines changed

1 file changed

+36
-1
lines changed

robots.json

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
"operator": "[Ai2](https://allenai.org/crawler)",
1414
"respect": "Yes"
1515
},
16+
"aiHitBot": {
17+
"operator": "[aiHit](https://www.aihitdata.com/about)",
18+
"respect": "Yes",
19+
"function": "A massive, artificial intelligence/machine learning, automated system.",
20+
"frequency": "No information provided.",
21+
"description": "Scrapes data for AI systems."
22+
},
1623
"Amazonbot": {
1724
"operator": "Amazon",
1825
"respect": "Yes",
@@ -97,6 +104,13 @@
97104
"frequency": "Unclear at this time.",
98105
"description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler"
99106
},
107+
"Cotoyogi": {
108+
"operator": "[ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/)",
109+
"respect": "Yes",
110+
"function": "AI LLM Scraper.",
111+
"frequency": "No information provided.",
112+
"description": "Scrapes data for AI training in the Japanese language."
113+
},
100114
"Crawlspace": {
101115
"operator": "[Crawlspace](https://crawlspace.dev)",
102116
"respect": "[Yes](https://news.ycombinator.com/item?id=42756654)",
@@ -125,6 +139,20 @@
125139
"frequency": "Up to 1 page per second",
126140
"description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
127141
},
142+
"Factset_spyderbot": {
143+
"operator": "[Factset](https://www.factset.com/ai)",
144+
"respect": "Unclear at this time.",
145+
"function": "AI model training.",
146+
"frequency": "No information provided.",
147+
"description": "Scrapes data for AI training."
148+
},
149+
"FirecrawlAgent": {
150+
"operator": "[Firecrawl](https://www.firecrawl.dev/)",
151+
"respect": "Yes",
152+
"function": "AI scraper and LLM training.",
153+
"frequency": "No information provided.",
154+
"description": "Scrapes data for AI systems and LLM training."
155+
},
128156
"FriendlyCrawler": {
129157
"description": "Unclear who the operator is; but data is used for training/machine learning.",
130158
"frequency": "Unclear at this time.",
@@ -321,6 +349,13 @@
321349
"operator": "[Sidetrade](https://www.sidetrade.com)",
322350
"respect": "Unclear at this time."
323351
},
352+
"TikTokSpider": {
353+
"operator": "ByteDance",
354+
"respect": "Unclear at this time.",
355+
"function": "LLM training.",
356+
"frequency": "Unclear at this time.",
357+
"description": "Downloads data to train LLMs, as per Bytespider."
358+
},
324359
"Timpibot": {
325360
"operator": "[Timpi](https://timpi.io)",
326361
"respect": "Unclear at this time.",
@@ -349,4 +384,4 @@
349384
"frequency": "No information.",
350385
"description": "Retrieves data used for You.com web search engine and LLMs."
351386
}
352-
}
387+
}

0 commit comments

Comments
 (0)