Assign new project by llama index

**#searchEngine**
import os
import nltk
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents (you would replace this with your actual document collection).
# Each entry is one plain-text document. `search` reports results by indexing
# back into this list, so its order must match the rows of the TF-IDF matrix
# built from it below.
documents = [
    "Machine learning is the study of computer algorithms that improve automatically through experience.",
    "Natural language processing (NLP) is a field of AI concerned with the interaction between computers and humans.",
    "Deep learning is a subset of machine learning in which artificial neural networks mimic the human brain.",
    "Search engines use algorithms to retrieve documents in response to user queries.",
    "Artificial intelligence (AI) is the simulation of human intelligence by machines.",
]

# Preprocessing function
def preprocess_text(text):
    """Lowercase and tokenize *text*, drop punctuation tokens, and re-join.

    Args:
        text: Raw document or query string.

    Returns:
        The surviving tokens joined by single spaces.

    Note:
        Stopwords are NOT removed here; only punctuation is filtered.
        Tokenization is delegated to ``nltk.word_tokenize`` (presumably this
        needs the NLTK tokenizer data to be installed -- confirm for your
        environment).
    """
    punctuation_chars = set(string.punctuation)
    # Tokenization
    tokens = nltk.word_tokenize(text.lower())
    # Remove punctuation. The test is per character on purpose:
    # ``token in string.punctuation`` is a substring check against one long
    # string, so multi-character punctuation tokens emitted by the tokenizer
    # (such as "...", or the "``" / "''" quote tokens) would slip through.
    kept_tokens = [
        token
        for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]
    return " ".join(kept_tokens)

# Normalize every document with the same routine that is later applied to queries
preprocessed_documents = list(map(preprocess_text, documents))

# Learn the TF-IDF vocabulary from the corpus and encode the corpus with it
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Function to perform search
def search(query, documents, tfidf_matrix, vectorizer, top_n=1):
    """Return the documents most similar to *query*, best match first.

    Args:
        query: Raw query string; it is run through ``preprocess_text`` before
            being vectorized.
        documents: Sequence of original document texts. Results are looked up
            by index, so it must be aligned row-for-row with *tfidf_matrix*.
        tfidf_matrix: TF-IDF matrix of the documents, compared against the
            query vector with cosine similarity.
        vectorizer: Fitted vectorizer exposing ``transform``; used to encode
            the query.
        top_n: Maximum number of results to return. Values less than or equal
            to zero yield an empty list.

    Returns:
        A list of ``(score, document)`` tuples ordered by descending cosine
        similarity, containing at most *top_n* entries.
    """
    # Guard first: with top_n == 0 the slice ``[-0:]`` is ``[0:]`` and would
    # return every document, and a negative top_n would return an arbitrary
    # slice of the ranking.
    if top_n <= 0:
        return []

    # Preprocess query the same way the documents were preprocessed
    query = preprocess_text(query)
    # Transform query to TF-IDF vector
    query_vector = vectorizer.transform([query])
    # Calculate cosine similarity between query vector and document vectors
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    # argsort is ascending, so take the last top_n indices and reverse them
    top_document_indices = cosine_similarities.argsort()[-top_n:][::-1]
    # Pair each score with its original (unprocessed) document text
    return [(cosine_similarities[i], documents[i]) for i in top_document_indices]

# Demo: rank the sample corpus against one query and keep the two best hits
query = "machine learning algorithms"
top_results = search(
    query=query,
    documents=documents,
    tfidf_matrix=tfidf_matrix,
    vectorizer=vectorizer,
    top_n=2,
)

# Report each hit with its similarity score rounded to two decimals
print(f"Top results for query '{query}':")
for score, result in top_results:
    print(f"Score: {score:.2f}, Document: {result}")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Assign new project by llama index #414

Sample documents (you would replace this with your actual document collection)

Preprocessing function

Preprocess each document

TF-IDF vectorization

Function to perform search

Example usage

Print results

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Assign new project by llama index #414

Description

Sample documents (you would replace this with your actual document collection)

Preprocessing function

Preprocess each document

TF-IDF vectorization

Function to perform search

Example usage

Print results

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions