From e1ec4b6a40364d918bc3abcf475df5226161eca5 Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 15:01:33 +0000 Subject: [PATCH 1/8] fix --- Dockerfile | 20 +- neurons/validator.py | 25 +- omega/utils/config.py | 7 - validator-api/app.py | 54 --- validator-api/check_vali_api.py | 30 +- validator-api/validator_api/score.py | 561 +-------------------------- 6 files changed, 48 insertions(+), 649 deletions(-) diff --git a/Dockerfile b/Dockerfile index ad2baff..91f6bc1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,30 +1,20 @@ -FROM --platform=linux/amd64 nvcr.io/nvidia/cuda:12.1.0-devel-ubuntu20.04 - -ENV DEBIAN_FRONTEND=noninteractive +FROM --platform=linux/amd64 python@sha256:370c586a6ffc8c619e6d652f81c094b34b14b8f2fb9251f092de23f16e299b78 # Install software-properties-common to add repositories -RUN apt-get -y update && apt-get install -y software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get -y update && apt-get install -y \ - python3.10 python3.10-distutils python3.10-venv python3.10-dev \ +RUN apt-get -y update && apt-get install -y \ git libsndfile1 build-essential ffmpeg libpq-dev \ - pkg-config libmysqlclient-dev && \ + pkg-config libmysqlclient-dev curl && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Update the symbolic link for python to point to python3.10 -RUN rm /usr/bin/python3 && \ - ln -s /usr/bin/python3.10 /usr/bin/python3 && \ - ln -s /usr/bin/python3.10 /usr/bin/python - WORKDIR /app/ # Install python requirements COPY ./requirements.txt ./requirements.txt COPY ./requirements_api.txt ./requirements_api.txt -RUN python -m ensurepip && python -m pip install --upgrade pip setuptools wheel uv -RUN python -m uv pip install -r requirements_api.txt --prerelease=allow --no-cache-dir +RUN python -m pip install --upgrade pip setuptools wheel uv +RUN python -m uv pip install --no-cache-dir -r requirements_api.txt --prerelease=allow COPY . . RUN python -m pip install -e . --no-cache-dir diff --git a/neurons/validator.py b/neurons/validator.py index d8fc6e2..ea61fbc 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -135,27 +135,12 @@ def __init__(self, config=None): self.load_topics_start = dt.datetime.now() self.all_topics = self.load_topics() - self.imagebind = None + self.imagebind = ImageBind(v2=True) self.load_focus_rewards_start = dt.datetime.now() self.FOCUS_REWARDS_PERCENT = self.load_focus_rewards_percent() self.AUDIO_REWARDS_PERCENT = AUDIO_REWARDS_PERCENT self.YOUTUBE_REWARDS_PERCENT = 1.0 - self.FOCUS_REWARDS_PERCENT - self.AUDIO_REWARDS_PERCENT - if not self.config.neuron.decentralization.off: - if torch.cuda.is_available(): - bt.logging.info( - f"Running with decentralization enabled, thank you Bittensor Validator!") - self.decentralization = True - self.imagebind = ImageBind(v2=True) - else: - bt.logging.warning( - f"Attempting to run decentralization, but no GPU found. Please see min_compute.yml for minimum resource requirements.") - self.decentralization = False - else: - bt.logging.warning( - "Running with --decentralization.off. It is strongly recommended to run with decentralization enabled.") - self.decentralization = False - def new_wandb_run(self): # Shoutout SN13 for the wandb snippet! """Creates a new wandb run to save information to.""" @@ -344,13 +329,7 @@ async def forward(self): # Adjust the scores based on responses from miners. try: - # Check if this validator is running decentralization - if not self.decentralization: - # if not, use validator API get_rewards system - rewards_list = await self.get_rewards(input_synapse=input_synapse, responses=finished_responses) - else: - # if so, use decentralization logic with local GPU - rewards_list = await self.handle_checks_and_rewards_youtube(input_synapse=input_synapse, responses=finished_responses) + rewards_list = await self.handle_checks_and_rewards_youtube(input_synapse=input_synapse, responses=finished_responses) except Exception as e: bt.logging.error( f"Error in handle_checks_and_rewards_youtube: {e}") diff --git a/omega/utils/config.py b/omega/utils/config.py index b68b8ea..c6c8cb2 100644 --- a/omega/utils/config.py +++ b/omega/utils/config.py @@ -107,13 +107,6 @@ def add_args(cls, parser): default=False, ) - parser.add_argument( - "--neuron.decentralization.off", - action="store_true", - help="Disable decentralization (not recommended).", - default=False, - ) - parser.add_argument( "--neuron.focus_videos", action="store_true", diff --git a/validator-api/app.py b/validator-api/app.py index 1d6366a..f5c78c6 100644 --- a/validator-api/app.py +++ b/validator-api/app.py @@ -42,7 +42,6 @@ from validator_api.communex.client import CommuneClient from validator_api.communex._common import get_node_url from omega.protocol import Videos, VideoMetadata, AudioMetadata -from validator_api.imagebind_loader import ImageBindLoader import aiohttp from validator_api.config import ( NETWORK, NETUID, PORT, @@ -84,7 +83,6 @@ def connect_to_db(): focus_api_key_header = APIKeyHeader(name="FOCUS_API_KEY", auto_error=False) security = HTTPBasic() -imagebind_loader = ImageBindLoader() focus_scoring_service = FocusScoringService() @@ -826,58 +824,6 @@ async def cache_max_focus_alpha(): ################ END OMEGA FOCUS ENDPOINTS ################ - """ TO BE DEPRECATED """ - @app.post("/api/validate") - async def validate( - videos: Videos, - hotkey: Annotated[str, Depends(get_hotkey)], - ) -> float: - if not authenticate_with_bittensor(hotkey, metagraph): - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail=f"Valid hotkey required.", - ) - uid = metagraph.hotkeys.index(hotkey) - - start_time = time.time() - - youtube_rewards = await score.score_and_upload_videos(videos, await imagebind_loader.get_imagebind()) - - if youtube_rewards is None: - print("YouTube rewards are empty, returning None") - return None - - total_rewards: float = youtube_rewards - - print(f"Total Rewards: {total_rewards}") - print( - f"Returning score={total_rewards} for validator={uid} in {time.time() - start_time:.2f}s") - - return total_rewards - - if not IS_PROD: - @app.get("/api/count_unique") - async def count_unique( - videos: Videos, - ) -> str: - nunique = await score.get_num_unique_videos(videos) - return f"{nunique} out of {len(videos.video_metadata)} submitted videos are unique" - - @app.get("/api/check_score") - async def check_score( - videos: Videos, - ) -> dict: - detailed_score = await score.score_videos_for_testing(videos, await imagebind_loader.get_imagebind()) - return detailed_score - - @app.get("/api/topic") - async def get_topic() -> str: - return random.choice(TOPICS_LIST) - - @app.get("/api/topics") - async def get_topics() -> List[str]: - return TOPICS_LIST - @app.get("/") async def healthcheck(): return datetime.utcnow() diff --git a/validator-api/check_vali_api.py b/validator-api/check_vali_api.py index 0743df2..e36a1e5 100644 --- a/validator-api/check_vali_api.py +++ b/validator-api/check_vali_api.py @@ -2,15 +2,20 @@ import asyncio import datetime import os +import time +import matplotlib.pyplot as plt API_URL = "https://validator.api.omega-labs.ai" -NUM_REQUESTS = 5 +# API_URL = "http://localhost:8001" +NUM_REQUESTS = 100 SAVE_DIR = "api_logs" async def check_validator_api(): + start = time.time() async with aiohttp.ClientSession() as session: + # async with session.get(f"{API_URL}/api/focus/get_list") as response: async with session.get(f"{API_URL}") as response: - return await response.text() + return await response.text(), time.time() - start async def main(): timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") @@ -21,11 +26,28 @@ async def main(): results = await asyncio.gather(*tasks) end_time = asyncio.get_event_loop().time() + # Extract durations for histogram + durations = [result[1] for result in results] + + # Create histogram + plt.figure(figsize=(10, 6)) + plt.hist(durations, bins=50, edgecolor='black') + plt.title('Distribution of API Request Durations') + plt.xlabel('Duration (seconds)') + plt.ylabel('Frequency') + plt.savefig(f"{SAVE_DIR}/duration_histogram_{timestamp}.png") + plt.close() + with open(output_file, 'w') as f: + f.write(f"API URL: {API_URL}\n") + f.write(f"Number of requests: {NUM_REQUESTS}\n") + f.write(f"Total time taken: {end_time - start_time:.2f} seconds\n") + f.write(f"Average time per request: {(end_time - start_time) / NUM_REQUESTS:.2f} seconds\n") + f.write(f"Max response time: {max(result[1] for result in results):.2f} seconds\n\n\n\n") for i, result in enumerate(results, 1): - f.write(f"Request {i}:\n{result}\n\n") - f.write(f"Total time taken: {end_time - start_time:.2f} seconds") + f.write(f"Request {i}:\n{result[0]}\nTime taken: {result[1]:.2f} seconds\n\n") print(f"Results saved to {output_file}") + print(f"Histogram saved to {SAVE_DIR}/duration_histogram_{timestamp}.png") asyncio.run(main()) diff --git a/validator-api/validator_api/score.py b/validator-api/validator_api/score.py index fcacd69..25cb649 100644 --- a/validator-api/validator_api/score.py +++ b/validator-api/validator_api/score.py @@ -1,50 +1,16 @@ import asyncio -import random import uuid -from typing import List, Tuple, Optional, BinaryIO -import math +from typing import List, Tuple from pinecone import Pinecone import torch import torch.nn.functional as F -import soundfile as sf -from io import BytesIO -from omega.protocol import Videos, VideoMetadata, AudioMetadata, Audios -from omega import video_utils, unstuff -from omega.constants import ( - MAX_VIDEO_LENGTH, - MIN_VIDEO_LENGTH, - DIFFERENCE_THRESHOLD, - SIMILARITY_THRESHOLD, - VIDEO_DOWNLOAD_TIMEOUT, - MIN_SCORE, - FAKE_VIDEO_PUNISHMENT, - QUERY_RELEVANCE_SCALING_FACTOR, - DESCRIPTION_RELEVANCE_SCALING_FACTOR, - VIDEO_RELEVANCE_WEIGHT, - DESCRIPTION_LENGTH_WEIGHT, - MIN_LENGTH_BOOST_TOKEN_COUNT, - MAX_LENGTH_BOOST_TOKEN_COUNT, - STUFFED_DESCRIPTION_PUNISHMENT, - DIARIZATION_SCALING_FACTOR, - AUDIO_LENGTH_SCALING_FACTOR, - AUDIO_QUALITY_SCALING_FACTOR, - AUDIO_QUERY_RELEVANCE_SCALING_FACTOR, - SPEECH_CONTENT_SCALING_FACTOR, - SPEAKER_DOMINANCE_SCALING_FACTOR, - BACKGROUND_NOISE_SCALING_FACTOR, - MAX_AUDIO_LENGTH_SECONDS, - MIN_AUDIO_LENGTH_SECONDS -) -from omega.imagebind_wrapper import ImageBind, Embeddings, run_async, LENGTH_TOKENIZER -from omega.text_similarity import get_text_similarity_score +from omega.protocol import VideoMetadata, AudioMetadata +from omega.constants import DIFFERENCE_THRESHOLD +from omega.imagebind_wrapper import Embeddings, run_async from validator_api import config from validator_api.dataset_upload import video_dataset_uploader, audio_dataset_uploader -from omega.audio_scoring import AudioScore -from omega.diarization_metric import calculate_diarization_metrics - - PINECONE_INDEX = Pinecone(api_key=config.PINECONE_API_KEY).Index(config.PINECONE_INDEX) @@ -115,36 +81,27 @@ async def compute_novelty_score(embeddings: Embeddings) -> Tuple[float, List[boo ]) return novelty_score, is_too_similar - def upload_to_pinecone(embeddings: Embeddings, metadata: List[VideoMetadata]) -> None: video_ids = [str(uuid.uuid4()) for _ in range(len(metadata))] try: PINECONE_INDEX.upsert( - vectors=sum([ - [ - { - "id": f"{modality_type[:3]}{video_uuid}", - "values": emb.tolist(), - "metadata": { - "youtube_id": video.video_id, - "modality_type": modality_type, - } + vectors=[ + { + "id": f"{VIDEO_TYPE[:3]}{video_uuid}", + "values": embedding_vid.tolist(), + "metadata": { + "youtube_id": video.video_id, + "modality_type": VIDEO_TYPE, } - for emb, modality_type - in zip( - [embedding_vid, embedding_aud, embedding_des], - [VIDEO_TYPE, AUDIO_TYPE, DESCRIPTION_TYPE] - ) - ] - for video_uuid, video, embedding_vid, embedding_aud, embedding_des - in zip(video_ids, metadata, embeddings.video, embeddings.audio, embeddings.description) - ], []), + } + for video_uuid, video, embedding_vid + in zip(video_ids, metadata, embeddings.video) + ], ) except Exception as e: print(f"Failed to upload to Pinecone: {e}") return video_ids - def upload_to_pinecone_audio(embeddings: Embeddings, metadata: List[AudioMetadata]) -> None: audio_ids = [str(uuid.uuid4()) for _ in range(len(metadata))] try: @@ -215,491 +172,3 @@ async def upload_audio_metadata( total_score ) return audio_ids - - -def filter_embeddings(embeddings: Embeddings, is_too_similar: List[bool]) -> Embeddings: - """Filter the embeddings based on whether they are too similar to the query.""" - is_too_similar = torch.tensor(is_too_similar) - if embeddings.video is not None: - embeddings.video = embeddings.video[~is_too_similar] - if embeddings.audio is not None: - embeddings.audio = embeddings.audio[~is_too_similar] - if embeddings.description is not None: - embeddings.description = embeddings.description[~is_too_similar] - return embeddings - - -def filter_stuffed_embeddings(embeddings: Embeddings, stuffed: List[Tuple[bool, float]]) -> Embeddings: - """Filter the embeddings based on whether they are too similar to the query.""" - stuffed = torch.tensor([s for s, _ in stuffed]) - if embeddings.video is not None: - embeddings.video = embeddings.video[~stuffed] - if embeddings.audio is not None: - embeddings.audio = embeddings.audio[~stuffed] - if embeddings.description is not None: - embeddings.description = embeddings.description[~stuffed] - return embeddings - -def is_similar(emb_1: torch.Tensor, emb_2: List[float]) -> bool: - return F.cosine_similarity( - emb_1, - torch.tensor(emb_2, device=emb_1.device).unsqueeze(0) - ) > SIMILARITY_THRESHOLD - - -def strict_is_similar(emb_1: torch.Tensor, emb_2: List[float]) -> bool: - return torch.allclose(emb_1, torch.tensor(emb_2, device=emb_1.device), atol=1e-4) - - -def metadata_check(metadata: List[VideoMetadata]) -> List[VideoMetadata]: - return [ - video_metadata for video_metadata in metadata - if ( - video_metadata.end_time - video_metadata.start_time <= MAX_VIDEO_LENGTH and - video_metadata.end_time - video_metadata.start_time >= MIN_VIDEO_LENGTH - ) - ] - - -def audio_metadata_check(metadata: List[AudioMetadata]) -> List[AudioMetadata]: - return [ - audio_metadata for audio_metadata in metadata - if ( - audio_metadata.end_time - audio_metadata.start_time <= MAX_VIDEO_LENGTH and - audio_metadata.end_time - audio_metadata.start_time >= MIN_VIDEO_LENGTH - ) - ] - -def deduplicate_audios(embeddings: Embeddings) -> List[bool]: - # return a list of booleans where True means the corresponding video is a duplicate i.e. is_similar - audio_tensor = embeddings.audio - num_audios = audio_tensor.shape[0] - # cossim = CosineSimilarity(dim=1) - is_similar = [] - for i in range(num_audios): - similarity_score = F.cosine_similarity(audio_tensor[[i]], audio_tensor[i + 1:]).max() - has_duplicates = (similarity_score > SIMILARITY_THRESHOLD).any() - is_similar.append(has_duplicates.item()) - - return is_similar - -def compute_novelty_score_among_batch_audio(emb: Embeddings) -> List[float]: - audio_tensor = emb.audio - num_audios = audio_tensor.shape[0] - novelty_scores = [] - for i in range(num_audios - 1): - similarity_score = F.cosine_similarity(audio_tensor[[i]], audio_tensor[i + 1:]).max() - novelty_scores.append(1 - similarity_score.item()) - novelty_scores.append(1.0) # last video is 100% novel - return novelty_scores - -def get_proxy_url() -> str: - return random.choice(config.PROXY_LIST + [None]) - - -async def get_random_video(metadata: List[VideoMetadata], check_video: bool) -> Optional[Tuple[VideoMetadata, Optional[BinaryIO]]]: - if not check_video: - random_metadata = random.choice(metadata) - return random_metadata, None - - random_video = None - metadata_copy = [v for v in metadata] # list shallow copy - while random_video is None and len(metadata_copy) > 0: - idx = random.randint(0, len(metadata_copy) - 1) - random_metadata = metadata_copy.pop(idx) - try: - async with DOWNLOAD_SEMAPHORE: - random_video = await asyncio.wait_for(run_async( - video_utils.download_youtube_video, - random_metadata.video_id, - random_metadata.start_time, - random_metadata.end_time, - proxy=get_proxy_url(), - ), timeout=VIDEO_DOWNLOAD_TIMEOUT) - except video_utils.IPBlockedException: - # IP is blocked, cannot download video, check description only - print("WARNING: IP is blocked, cannot download video, checking description only") - return random_metadata, None - except video_utils.FakeVideoException: - print(f"WARNING: Video {random_metadata.video_id} is fake, punishing miner") - return None - except asyncio.TimeoutError: - continue - - # IP is not blocked, video is not fake, but video download failed for some reason. We don't - # know why it failed so we won't punish the miner, but we will check the description only. - if random_video is None: - return random_metadata, None - - return random_metadata, random_video - - -async def random_check(random_meta_and_vid: List[VideoMetadata], imagebind: ImageBind) -> bool: - random_metadata, random_video = random_meta_and_vid - - if random_video is None: - desc_embeddings = await imagebind.embed_text_async([random_metadata.description]) - is_similar_ = is_similar(desc_embeddings, random_metadata.description_emb) - strict_is_similar_ = strict_is_similar(desc_embeddings, random_metadata.description_emb) - print(f"Description similarity: {is_similar_}, strict description similarity: {strict_is_similar_}") - return is_similar_ - - # Video downloaded, check all embeddings - embeddings = await imagebind.embed_async([random_metadata.description], [random_video]) - is_similar_ = ( - is_similar(embeddings.video, random_metadata.video_emb) and - is_similar(embeddings.audio, random_metadata.audio_emb) and - is_similar(embeddings.description, random_metadata.description_emb) - ) - strict_is_similar_ = ( - strict_is_similar(embeddings.video, random_metadata.video_emb) and - strict_is_similar(embeddings.audio, random_metadata.audio_emb) and - strict_is_similar(embeddings.description, random_metadata.description_emb) - ) - print(f"Total similarity: {is_similar_}, strict total similarity: {strict_is_similar_}") - return is_similar_ - - -async def get_num_unique_videos(videos: Videos) -> int: - metadata = videos.video_metadata - embeddings = Embeddings( - video=torch.stack([torch.tensor(v.video_emb) for v in metadata]), - audio=torch.stack([torch.tensor(v.audio_emb) for v in metadata]), - description=torch.stack([torch.tensor(v.description_emb) for v in metadata]), - ) - novelty_score, is_too_similar = await compute_novelty_score(embeddings) - return sum([not is_sim for is_sim in is_too_similar]) - - -async def _run_video_scoring(videos: Videos, imagebind: ImageBind, is_check_only: bool) -> float: - - # check video_ids for fake videos - if any(not video_utils.is_valid_youtube_id(video.video_id) for video in videos.video_metadata): - return {"score": FAKE_VIDEO_PUNISHMENT} - - metadata = metadata_check(videos.video_metadata)[:videos.num_videos] - print(f"Filtered {len(videos.video_metadata)} videos down to {len(metadata)} videos") - - # return minimum score if no videos were found in video_metadata - if len(metadata) == 0: - return {"score": MIN_SCORE} - - check_video = config.CHECK_PROBABILITY > random.random() - random_meta_and_vid = await get_random_video(metadata, check_video) - if random_meta_and_vid is None: - return {"score": FAKE_VIDEO_PUNISHMENT} - - async with GPU_SEMAPHORE: - passed_check = await random_check(random_meta_and_vid, imagebind) - if not passed_check: - return {"score": FAKE_VIDEO_PUNISHMENT} - - query_emb = await imagebind.embed_text_async([videos.query]) - - # Upload the videos to Pinecone and deduplicate - original_length = len(metadata) - embeddings = Embeddings( - video=torch.stack([torch.tensor(v.video_emb) for v in metadata]).to(imagebind.device), - audio=torch.stack([torch.tensor(v.audio_emb) for v in metadata]).to(imagebind.device), - description=torch.stack([torch.tensor(v.description_emb) for v in metadata]).to(imagebind.device), - ) - novelty_score, is_too_similar = await compute_novelty_score(embeddings) - embeddings = filter_embeddings(embeddings, is_too_similar) - metadata = [metadata for metadata, too_similar in zip(metadata, is_too_similar) if not too_similar] - print(f"Deduplicated {original_length} videos down to {len(metadata)} videos") - - # Filter out "stuffed" descriptions. - pre_filter_metadata_length = len(metadata) - stuffed = [ - unstuff.is_stuffed(meta.description) - for meta in metadata - ] - if any([garbage and confidence > 0.75 for garbage, confidence in stuffed]): - print("Stuffed description found with high confidence, penalizing the miner.") - return {"score": STUFFED_DESCRIPTION_PUNISHMENT} - - # More stuffing. - extraneous = [ - unstuff.check_extraneous_chunks(meta.description, meta.video_emb, meta.audio_emb, imagebind) - for meta in metadata - ] - for really_bad, low_quality, total in extraneous: - if really_bad > 5 or low_quality >= 16: - print(f"Extraneous garbage found in text check {really_bad=} {low_quality=} {total=}") - return {"score": STUFFED_DESCRIPTION_PUNISHMENT} - - metadata = [ - metadata[idx] - for idx in range(len(metadata)) - if not stuffed[idx][0] - and extraneous[idx][1] <= 15 - and extraneous[idx][2] <= 50 - ] - if len(metadata) < pre_filter_metadata_length: - print(f"Filtering {pre_filter_metadata_length} videos down to {len(metadata)} videos to remove token-stuffed descriptions.") - if len(metadata) == 0: - return {"score": MIN_SCORE} - - embeddings = filter_stuffed_embeddings(embeddings, stuffed) - - # Compute relevance scores - video_description_relevance_scores = F.cosine_similarity( - embeddings.video, embeddings.description - ).tolist() - audio_description_relevance_scores = F.cosine_similarity( - embeddings.audio, embeddings.description - ).tolist() - video_query_relevance_scores = F.cosine_similarity( - embeddings.video, query_emb - ).tolist() - audio_query_relevance_scores = F.cosine_similarity( - embeddings.audio, query_emb - ).tolist() - - # Query relevance score now includes video cosim, audio cosim, and text cosim using higher quality text-only model. - query_relevance_scores = [ - sum([ - video_query_relevance_scores[idx], - audio_query_relevance_scores[idx], - get_text_similarity_score(metadata[idx].description, videos.query), - ]) / 3 - for idx in range(len(video_query_relevance_scores)) - ] - - # Combine audio & visual description scores, weighted towards visual. - description_relevance_scores = [ - sum([ - video_description_relevance_scores[idx] * VIDEO_RELEVANCE_WEIGHT, - audio_description_relevance_scores[idx] * (1.0 - VIDEO_RELEVANCE_WEIGHT), - ]) - for idx in range(len(video_description_relevance_scores)) - ] - - # Scale description scores by number of unique tokens. - length_scalers = [] - for idx in range(len(description_relevance_scores)): - unique_tokens = LENGTH_TOKENIZER(metadata[idx].description) - unique_tokens = set(unique_tokens[unique_tokens != 0][1:-1].tolist()) - unique_token_count = len(unique_tokens) - if unique_token_count <= MIN_LENGTH_BOOST_TOKEN_COUNT: - print(f"Very few tokens, applying {DESCRIPTION_LENGTH_WEIGHT} penalty.") - description_relevance_scores[idx] *= (1.0 - DESCRIPTION_LENGTH_WEIGHT) - length_scalers.append(0) - continue - length_scaler = min(math.log(MAX_LENGTH_BOOST_TOKEN_COUNT, 2), math.log(unique_token_count, 2)) - math.log(MIN_LENGTH_BOOST_TOKEN_COUNT, 2) - length_scaler /= (math.log(MAX_LENGTH_BOOST_TOKEN_COUNT, 2) - math.log(MIN_LENGTH_BOOST_TOKEN_COUNT, 2)) - length_scalers.append(length_scaler) - print(f"Description length scaling factor = {length_scaler}") - description_relevance_scores[idx] -= description_relevance_scores[idx] * DESCRIPTION_LENGTH_WEIGHT * (1.0 - length_scaler) - - # Aggregate scores - score = ( - (sum(description_relevance_scores) * DESCRIPTION_RELEVANCE_SCALING_FACTOR) + - (sum(query_relevance_scores) * QUERY_RELEVANCE_SCALING_FACTOR) - ) / 2 / videos.num_videos - - print(f''' - is_unique: {[not is_sim for is_sim in is_too_similar]}, - video cosine sim: {video_description_relevance_scores}, - audio cosine sim: {audio_description_relevance_scores}, - description relevance scores: {description_relevance_scores}, - query relevance scores: {query_relevance_scores}, - length scalers: {length_scalers}, - total score: {score} - ''') - - if not is_check_only and len(metadata) > 0: - video_ids = await run_async(upload_to_pinecone, embeddings, metadata) - # Schedule upload to HuggingFace - video_dataset_uploader.add_videos( - metadata, - video_ids, - description_relevance_scores, - query_relevance_scores, - videos.query, - ) - score = max(score, MIN_SCORE) - - if score > 0.4: - print(f"Videos with score > 0.4: {metadata}") - - return { - "is_unique": [not is_sim for is_sim in is_too_similar], - "description_relevance_scores": description_relevance_scores, - "query_relevance_scores": query_relevance_scores, - "score": score, - } - - -async def _run_audio_scoring(audios: Audios, imagebind: ImageBind, is_check_only: bool = False) -> float: - """Score audio submissions and optionally upload them. - - Args: - audios: The audio submissions to score - imagebind: ImageBind model for embeddings - is_check_only: If True, only score without uploading - - Returns: - Either the final score (float) or a dict with detailed scoring info - """ - if len(audios.audio_metadata) == 0: - return MIN_SCORE - - # Check for valid YouTube IDs - if any(not video_utils.is_valid_youtube_id(audio.video_id) for audio in audios.audio_metadata): - return FAKE_VIDEO_PUNISHMENT - - - # Check audio metadata and filter out invalid ones - metadata = audio_metadata_check(audios.audio_metadata)[:audios.num_audios] - print(f"Filtered {len(audios.audio_metadata)} audios down to {len(metadata)} audios") - - - # execute the random check on metadata and video - async with GPU_SEMAPHORE: - query_emb = await imagebind.embed_text_async([audios.query]) - - embeddings = Embeddings( - video=None, - audio=torch.stack([torch.tensor(a.audio_emb) for a in metadata]).to(imagebind.device), - description=None - ) - - # check and deduplicate videos based on embedding similarity checks. We do this because we're not uploading to pinecone first. - metadata_is_similar = await deduplicate_audios(embeddings) - metadata = [metadata for metadata, too_similar in zip(metadata, metadata_is_similar) if not too_similar] - embeddings = filter_embeddings(embeddings, metadata_is_similar) - - if len(metadata) < len(audios.audio_metadata): - print(f"Deduplicated {len(audios.audio_metadata)} audios down to {len(metadata)} audios") - - if len(metadata) == 0: - return MIN_SCORE - - # first get local novelty scores - local_novelty_scores = compute_novelty_score_among_batch_audio(embeddings) - pre_filter_metadata_length = len(metadata) - # check scores from index for being too similar - is_too_similar = [score < DIFFERENCE_THRESHOLD for score in local_novelty_scores] - # filter out metadata too similar - metadata = [metadata for metadata, too_similar in zip(metadata, is_too_similar) if not too_similar] - # filter out embeddings too similar - embeddings = filter_embeddings(embeddings, is_too_similar) - if len(metadata) < pre_filter_metadata_length: - print(f"Filtering {pre_filter_metadata_length} audios down to {len(metadata)} audios that are too similar to audios in our index.") - - # return minimum score if no unique videos were found - if len(metadata) == 0: - return MIN_SCORE - - # Filter metadata based on length constraints - metadata = [ - meta for meta in audios.audio_metadata[:audios.num_audios] - if (meta.end_time - meta.start_time) >= MIN_AUDIO_LENGTH_SECONDS - and (meta.end_time - meta.start_time) <= MAX_AUDIO_LENGTH_SECONDS - ] - - if len(metadata) == 0: - return MIN_SCORE - - total_audio_length = sum((meta.end_time - meta.start_time) for meta in metadata) - print(f"Average audio length: {total_audio_length/len(metadata):.2f} seconds") - audio_length_score = total_audio_length/(audios.num_audios*MAX_AUDIO_LENGTH_SECONDS) - - audio_query_score = sum(F.cosine_similarity( - embeddings.audio, query_emb - ).tolist())/len(metadata) - print(f"Audio query score: {audio_query_score}") - - # Randomly sample one audio for duration check - selected_random_meta = random.choice(metadata) - audio_array, sr = sf.read(BytesIO(selected_random_meta.audio_bytes)) - audio_duration = len(audio_array) / sr - print(f"Selected Youtube Video: {selected_random_meta.video_id}, Duration: {audio_duration:.2f} seconds") - - audio_quality_scores = AudioScore().total_score( - audio_array, - sr, - selected_random_meta.diar_timestamps_start, - selected_random_meta.diar_timestamps_end, - selected_random_meta.diar_speakers - ) - audio_quality_total_score = ( - audio_quality_scores["speech_content_score"] * SPEECH_CONTENT_SCALING_FACTOR + - audio_quality_scores["speaker_dominance_score"] * SPEAKER_DOMINANCE_SCALING_FACTOR + - audio_quality_scores["background_noise_score"] * BACKGROUND_NOISE_SCALING_FACTOR - ) - - miner_diar_segment = { - "start": selected_random_meta.diar_timestamps_start, - "end": selected_random_meta.diar_timestamps_end, - "speakers": selected_random_meta.diar_speakers - } - - diarization_score = calculate_diarization_metrics( - audio_array, - sr, - miner_diar_segment - ) - inverse_der = diarization_score["inverse_der"] - total_score = ( - DIARIZATION_SCALING_FACTOR * inverse_der + - AUDIO_LENGTH_SCALING_FACTOR * audio_length_score + - AUDIO_QUALITY_SCALING_FACTOR * audio_quality_total_score + - AUDIO_QUERY_RELEVANCE_SCALING_FACTOR * audio_query_score - ) - - print(f''' - is_unique: {[not is_sim for is_sim in is_too_similar]}, - audio_query_score: {audio_query_score}, - audio_length_score: {audio_length_score}, - audio_quality_score: {audio_quality_total_score}, - diarization_score: {inverse_der}, - total score: {total_score} - ''') - - if not is_check_only and len(metadata) > 0: - # Upload metadata and schedule dataset upload - audio_ids = await run_async(upload_to_pinecone_audio, embeddings, metadata) - - audio_dataset_uploader.add_audios( - metadata, - audio_ids, - inverse_der, - audio_length_score, - audio_quality_total_score, - audio_query_score, - audios.query, - total_score, - ) - total_score = max(total_score, MIN_SCORE) - - if total_score > 0.4: - print(f"Audios with score > 0.4: {metadata}") - - return { - "is_unique": [not is_sim for is_sim in is_too_similar], - "audio_query_score": audio_query_score, - "audio_length_score": audio_length_score, - "audio_quality_score": audio_quality_total_score, - "diarization_score": inverse_der, - "score": total_score - } - - -async def score_videos_for_testing(videos: Videos, imagebind: ImageBind) -> float: - return await _run_video_scoring(videos, imagebind, is_check_only=True) - - -async def score_and_upload_videos(videos: Videos, imagebind: ImageBind) -> float: - scores_dict = await _run_video_scoring(videos, imagebind, is_check_only=False) - return scores_dict["score"] - - -async def score_audios_for_testing(audios: Audios, imagebind: ImageBind) -> float: - return await _run_audio_scoring(audios, imagebind, is_check_only=True) - - -async def score_and_upload_audios(audios: Audios, imagebind: ImageBind) -> float: - scores_dict = await _run_audio_scoring(audios, imagebind, is_check_only=False) - return scores_dict["score"] \ No newline at end of file From dddc6d218308ee473f5b2da20031100f307f5cd8 Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 15:31:10 +0000 Subject: [PATCH 2/8] fixed --- Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 91f6bc1..07fb6ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,11 @@ FROM --platform=linux/amd64 python@sha256:370c586a6ffc8c619e6d652f81c094b34b14b8f2fb9251f092de23f16e299b78 -# Install software-properties-common to add repositories -RUN apt-get -y update && apt-get install -y \ +# Install software-properties-common to add repositories. +# Note that mariadb is compatible with mysql which is why we use it +RUN apt-get -y update && apt-get install -y software-properties-common && \ + apt-get -y update && apt-get install -y \ git libsndfile1 build-essential ffmpeg libpq-dev \ - pkg-config libmysqlclient-dev curl && \ + pkg-config libmariadb-dev curl && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* From aeeda1f63017487c772cc19d8f3fcc75ef909d6b Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 16:47:33 +0000 Subject: [PATCH 3/8] pipelin --- .github/workflows/dev-pipeline.yml | 71 ++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 .github/workflows/dev-pipeline.yml diff --git a/.github/workflows/dev-pipeline.yml b/.github/workflows/dev-pipeline.yml new file mode 100644 index 0000000..e993924 --- /dev/null +++ b/.github/workflows/dev-pipeline.yml @@ -0,0 +1,71 @@ +name: Dev CI/CD Pipeline + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + workflow_dispatch: + +env: + REGION: us-central1-a + REPO_NAME: bittensor + IMAGE_NAME: sn24-vali-api + DEPLOYMENT_NAME: dev-sn24-vali-api + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + environment: development + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Get Git SHA + id: git_sha + run: echo "sha=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v1 + + - name: Configure Docker + run: gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet + + - name: Create .env file + run: | + echo "${{ secrets.ENV_VARIABLES }}" > .env + + - name: Build Docker image + run: | + docker build --platform=linux/amd64 \ + --build-arg APP_DIR=/app \ + -t ${{ env.REGION }}-docker.pkg.dev/${{ secrets.PROJECT_ID }}/${{ env.REPO_NAME }}/${{ env.IMAGE_NAME }}:prod-${{ steps.git_sha.outputs.sha }} . + + - name: Push Docker image + run: | + docker push ${{ env.REGION }}-docker.pkg.dev/${{ secrets.PROJECT_ID }}/${{ env.REPO_NAME }}/${{ env.IMAGE_NAME }}:prod-${{ steps.git_sha.outputs.sha }} + + - name: Get GKE credentials + uses: google-github-actions/get-gke-credentials@v1 + with: + cluster_name: ${{ secrets.DEV_CLUSTER }} + location: ${{ env.REGION }} + + - name: Deploy to Dev + run: | + kubectl set image deployment/${{ env.DEPLOYMENT_NAME }} \ + backend=${{ env.REGION }}-docker.pkg.dev/${{ secrets.PROJECT_ID }}/${{ env.REPO_NAME }}/${{ env.IMAGE_NAME }}:prod-${{ steps.git_sha.outputs.sha }} \ + -n dev + kubectl rollout status deployment/${{ env.DEPLOYMENT_NAME }} -n dev + + - name: Cleanup sensitive files + if: always() + run: | + rm -f .env From 20358e34d05c4e8d3993979d0d3a5a1f95cc0fad Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 16:57:32 +0000 Subject: [PATCH 4/8] change vali API url to new URL --- neurons/test_miner.py | 2 +- neurons/validator.py | 4 ++-- purchase_focus_video.py | 4 ++-- validator-api/check_vali_api.py | 3 ++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/neurons/test_miner.py b/neurons/test_miner.py index 169428f..a53f73f 100644 --- a/neurons/test_miner.py +++ b/neurons/test_miner.py @@ -26,7 +26,7 @@ else: videos = Videos(query=query, num_videos=num_videos, video_metadata=video_metadata_list) response = requests.get( - "https://dev-validator.api.omega-labs.ai/api/count_unique", + "https://dev-sn24-api.omegatron.ai/api/count_unique", json=videos.to_serializable_dict(videos) ) print(response.json()) diff --git a/neurons/validator.py b/neurons/validator.py index ea61fbc..1b86054 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -127,9 +127,9 @@ def __init__(self, config=None): self.successfully_started_wandb = False self.api_root = ( - "https://dev-validator.api.omega-labs.ai" + "https://dev-sn24-api.omegatron.ai" if self.config.subtensor.network == "test" else - "https://validator.api.omega-labs.ai" + "https://sn24-api.omegatron.ai" ) # load topics from topics URL (CSV) or fallback to local topics file self.load_topics_start = dt.datetime.now() diff --git a/purchase_focus_video.py b/purchase_focus_video.py index 2d39e89..4c3aadb 100644 --- a/purchase_focus_video.py +++ b/purchase_focus_video.py @@ -76,9 +76,9 @@ SUBTENSOR_NETWORK = None # "test" or None API_BASE = ( - "https://dev-validator.api.omega-labs.ai" + "https://dev-sn24-api.omegatron.ai" if SUBTENSOR_NETWORK == "test" else - "https://validator.api.omega-labs.ai" + "https://sn24-api.omegatron.ai" ) CYAN = "\033[96m" diff --git a/validator-api/check_vali_api.py b/validator-api/check_vali_api.py index e36a1e5..70efdc4 100644 --- a/validator-api/check_vali_api.py +++ b/validator-api/check_vali_api.py @@ -7,7 +7,8 @@ API_URL = "https://validator.api.omega-labs.ai" # API_URL = "http://localhost:8001" -NUM_REQUESTS = 100 +API_URL = "https://sn24-api.omegatron.ai" +NUM_REQUESTS = 1000 SAVE_DIR = "api_logs" async def check_validator_api(): From 9eb81136ace901509d32c1e83a6ddd84b61495de Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 17:15:02 +0000 Subject: [PATCH 5/8] fixed ci/cd maybe --- .github/workflows/dev-pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dev-pipeline.yml b/.github/workflows/dev-pipeline.yml index e993924..021cbd7 100644 --- a/.github/workflows/dev-pipeline.yml +++ b/.github/workflows/dev-pipeline.yml @@ -12,6 +12,7 @@ env: REPO_NAME: bittensor IMAGE_NAME: sn24-vali-api DEPLOYMENT_NAME: dev-sn24-vali-api + DEV_CLUSTER: dev-sn24-vali-api jobs: build-and-deploy: @@ -55,7 +56,7 @@ jobs: - name: Get GKE credentials uses: google-github-actions/get-gke-credentials@v1 with: - cluster_name: ${{ secrets.DEV_CLUSTER }} + cluster_name: ${{ env.DEV_CLUSTER }} location: ${{ env.REGION }} - name: Deploy to Dev From f1825eda48a4adddb25a03cfe8017c9237eb9722 Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 17:24:37 +0000 Subject: [PATCH 6/8] fix pipeline region var --- .github/workflows/dev-pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev-pipeline.yml b/.github/workflows/dev-pipeline.yml index 021cbd7..1042a13 100644 --- a/.github/workflows/dev-pipeline.yml +++ b/.github/workflows/dev-pipeline.yml @@ -8,7 +8,7 @@ on: workflow_dispatch: env: - REGION: us-central1-a + REGION: us-central1 REPO_NAME: bittensor IMAGE_NAME: sn24-vali-api DEPLOYMENT_NAME: dev-sn24-vali-api @@ -57,7 +57,7 @@ jobs: uses: google-github-actions/get-gke-credentials@v1 with: cluster_name: ${{ env.DEV_CLUSTER }} - location: ${{ env.REGION }} + location: ${{ env.REGION }}-a - name: Deploy to Dev run: | From 7469309a619abf01e63186d8aeef64e3b578929d Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 17:49:31 +0000 Subject: [PATCH 7/8] fix --- .github/workflows/dev-pipeline.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev-pipeline.yml b/.github/workflows/dev-pipeline.yml index 1042a13..cce6a01 100644 --- a/.github/workflows/dev-pipeline.yml +++ b/.github/workflows/dev-pipeline.yml @@ -13,6 +13,7 @@ env: IMAGE_NAME: sn24-vali-api DEPLOYMENT_NAME: dev-sn24-vali-api DEV_CLUSTER: dev-sn24-vali-api + NAMESPACE: dev-sn24-vali-api jobs: build-and-deploy: @@ -63,8 +64,8 @@ jobs: run: | kubectl set image deployment/${{ env.DEPLOYMENT_NAME }} \ backend=${{ env.REGION }}-docker.pkg.dev/${{ secrets.PROJECT_ID }}/${{ env.REPO_NAME }}/${{ env.IMAGE_NAME }}:prod-${{ steps.git_sha.outputs.sha }} \ - -n dev - kubectl rollout status deployment/${{ env.DEPLOYMENT_NAME }} -n dev + -n ${{ env.NAMESPACE }} + kubectl rollout status deployment/${{ env.DEPLOYMENT_NAME }} -n ${{ env.NAMESPACE }} - name: Cleanup sensitive files if: always() From 8526bfea8c110d9333dd3c13222e78c7334b1cc3 Mon Sep 17 00:00:00 2001 From: Salman Date: Thu, 20 Feb 2025 17:59:47 +0000 Subject: [PATCH 8/8] fix pipeline --- .github/workflows/dev-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dev-pipeline.yml b/.github/workflows/dev-pipeline.yml index cce6a01..6baf92d 100644 --- a/.github/workflows/dev-pipeline.yml +++ b/.github/workflows/dev-pipeline.yml @@ -63,7 +63,7 @@ jobs: - name: Deploy to Dev run: | kubectl set image deployment/${{ env.DEPLOYMENT_NAME }} \ - backend=${{ env.REGION }}-docker.pkg.dev/${{ secrets.PROJECT_ID }}/${{ env.REPO_NAME }}/${{ env.IMAGE_NAME }}:prod-${{ steps.git_sha.outputs.sha }} \ + ${{ env.IMAGE_NAME }}=${{ env.REGION }}-docker.pkg.dev/${{ secrets.PROJECT_ID }}/${{ env.REPO_NAME }}/${{ env.IMAGE_NAME }}:prod-${{ steps.git_sha.outputs.sha }} \ -n ${{ env.NAMESPACE }} kubectl rollout status deployment/${{ env.DEPLOYMENT_NAME }} -n ${{ env.NAMESPACE }}