meedan · skyemeedan · Jan 14, 2025 · Jan 7, 2025 · Jan 7, 2025 · Jan 7, 2025
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
@@ -32,7 +32,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        language: [ 'python', 'ruby' ]
+        language: [ 'python' ]
         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
         # Learn more about CodeQL language support at https://git.io/codeql-language-support
 
@@ -42,7 +42,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v1
+      uses: github/codeql-action/init@v3
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -53,7 +53,7 @@ jobs:
     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v1
+      uses: github/codeql-action/autobuild@v3
 
     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 https://git.io/JvXDl
@@ -67,4 +67,4 @@ jobs:
     #   make release
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v1
+      uses: github/codeql-action/analyze@v3
diff --git a/Dockerfile b/Dockerfile
@@ -3,8 +3,7 @@ WORKDIR /app
 
 # Install dependencies
 ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y ffmpeg cmake swig libavcodec-dev libavformat-dev
-RUN ln -s /usr/bin/ffmpeg /usr/local/bin/ffmpeg
+RUN apt-get update && apt-get install -y cmake swig
 
 
 # Other configurations

@@ -20,9 +20,6 @@ run_model:
 run_rq_worker:
 	while true; do python manage.py run_rq_worker; done
 
-run_video_matcher:
-	while true; do python manage.py run_video_matcher; done
-
 test: wait
 	python manage.py init_perl_functions
 	coverage run --source=app/main/ manage.py test

@@ -29,12 +29,10 @@ class Config:
   MODEL_NAME = os.getenv('MODEL_NAME')
   MAX_CLAUSE_COUNT = 1000
   PERSISTENT_DISK_PATH = os.getenv('PERSISTENT_DISK_PATH', '/app/persistent_disk')
-  VIDEO_MODEL = os.getenv('VIDEO_MODEL', 'video-model')
   try:
     VIDEO_MODEL_L1_SCORE = float(os.getenv('video_model_l1_score', '0.7'))
   except:
     VIDEO_MODEL_L1_SCORE = 0.7
-  AUDIO_MODEL = os.getenv('AUDIO_MODEL', 'audio-model')
   IMAGE_MODEL = os.getenv('IMAGE_MODEL', default='phash')
   OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', default=None)
   ALEGRE_HOST = os.getenv('ALEGRE_HOST', default="http://alegre:3100")

@@ -25,7 +25,6 @@ def get(self):
             'image/similarity': ['phash'],
             'image/ocr': ['google'],
             'audio/transcription': ['aws'],
-            'audio/similarity': ['hash'],
             'video/similarity': ['tmk'],
             'graph/cluster': [],
         }

@@ -27,10 +27,10 @@ def model_response_package(graph, url, doc_id):
   }
 
 def audio_model():
-  return SharedModel.get_client(app.config['AUDIO_MODEL'])
+  return SharedModel.get_client()
 
 def video_model():
-  return SharedModel.get_client(app.config['VIDEO_MODEL'])
+  return SharedModel.get_client()
 
 def get_iterable_objects(graph, data_type):
   try:

@@ -116,21 +116,11 @@ class HybridLangidProvider:
   def langid(text):
     fasttext_result = FastTextLangidProvider.langid(text)
     cld_result = Cld3LangidProvider.langid(text)
-    # max_confidence = max(fasttext_result['result']['confidence'], cld_result['result']['confidence'])
-    min_confidence = min(fasttext_result['result']['confidence'], cld_result['result']['confidence'])
-
-    # if fasttext_result['result']['language'] == cld_result['result']['language'] or max_confidence >= 0.8:
-    if fasttext_result['result']['language'] == cld_result['result']['language'] and min_confidence >= 0.9:
-      # OLD - FastText and CLD agree or one of them is more than 80% confident.
-      # Now - FastText and CLD agree AND BOTH are more than 90% confident
-      # Return the higher confidence result
-      # if fasttext_result['result']['language'] != cld_result['result']['language']:
-      #   # Log when there is disagreement
-      #   app.logger.info(json.dumps({
-      #     'service':'LangId',
-      #     'message': 'Disagreement between fasttext and cld. Returning higher confidence model',
-      #     'parameters':{'text':text, 'fasttext':fasttext_result, 'cld':cld_result,},
-      #     }))
+    # Current strategy: both CLD3 and FastText must have non-null confidence scores, agree on the language tag, and BOTH be at least the threshold (0.7) confident. Reference ticket CV2-5367
+    if fasttext_result['result']['confidence'] is not None and cld_result['result']['confidence'] is not None \
+            and fasttext_result['result']['language'] == cld_result['result']['language'] \
+            and  min(fasttext_result['result']['confidence'], cld_result['result']['confidence']) >= 0.7:
+
       if fasttext_result['result']['confidence'] > cld_result['result']['confidence']:
         return fasttext_result
       else:

@@ -13,7 +13,6 @@
 import numpy as np
 from sqlalchemy.orm.exc import NoResultFound
 
-from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.lib.helpers import context_matches
 from app.main.lib.similarity_helpers import get_context_query, drop_context_from_record
 from app.main.lib import media_crud
@@ -24,7 +23,7 @@
 def _after_log(retry_state):
   app.logger.debug("Retrying audio similarity...")
 
-class AudioModel(SharedModel):
+class AudioModel():
     def delete(self, task):
         return media_crud.delete(task, Audio)
 

@@ -14,7 +14,6 @@
 import tmkpy
 from sqlalchemy.orm.exc import NoResultFound
 
-from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.lib.similarity_helpers import get_context_query, drop_context_from_record
 from app.main.lib.helpers import context_matches
 from app.main.lib import media_crud
@@ -28,7 +27,7 @@
 def _after_log(retry_state):
   app.logger.debug("Retrying video similarity...")
 
-class VideoModel(SharedModel):
+class VideoModel():
     def overload_context_to_denote_content_type(self, task):
         return {**task, **{"context": {**task.get("context", {}), **{"content_type": "video"}}}}
 

@@ -75,10 +75,10 @@ def get_body_for_text_document(params, mode):
     return params
 
 def audio_model():
-  return AudioModel(app.config['AUDIO_MODEL'])
+  return AudioModel()
 
 def video_model():
-  return VideoModel(app.config['VIDEO_MODEL'])
+  return VideoModel()
 
 def model_response_package(item, command):
   response_package = {

@@ -16,7 +16,7 @@ class TestAsyncSimilarityBlueprint(BaseTestCase):
     def setUp(self):
         super().setUp()
         first_print = [-248655731, -231870068, -230690420, -482429284, -478234963, -503476625, -520316369, -521361138, 1634511886, 1647109134, 1647046702, 1646940206, 1646924078, -500563482, -496367961, -471202139, -474282347, -476481849, -510101945, -510069497, -526854905, -237050874, -251730922, -251792089, -503463131, -513949140, -513949140, -1587752392, -1250138600, -180474360, -181522936, -194113975, -261353745, -253227346, -189264210, -188938850, -251825010, -251861834, -797121369, 1366287511, 1898902657, 1932452993, 1932452993, 1936651425, 1928253859, -491814237, -487750941, -496401919, -500657663, -500657643, -483876315, -517414355, -534219217, -529853138, -521597906, -524744474, -459335514, -255973226, -255973242, 1908283526, 1925055878, 1929249159, 1392390532, 1383981188, 1378656532, 1915527460, 1915527212, 1915528248, 1903135752, 1885837336, 1894160408, -253321943, -253326037, -262747077, -263193126, -262311942, -159482198, -151365974, -152489301, -152554837, -228052277, -232251189, -231202597, -243569493, -253069157, -257238902, -257242230, -521302374, -529751382, -517430614, -482831830, -483884501, -479492807, -534139591, -534190021, -534124501, -513115153, -479590737, -487980369, -486931793, -487062593, -488087363, -513253323, -529931243, -529865723, -521475067, -521475065, -252982986, -253179866, -260519706, -514274074, -472199258, -493164874, -1564809486, -1561472269, -1569918447, -1574116603, -1574113276, -1557204988, -483728380, -517313481, -528802706, -520549138, -1600584530, -1600453442, -1583800134, -1281875782, -1292339717, -1293328695, -1292907831, -1292969380, -1276199332, -504392116, -533941748, -533945844, -517414116, -517410760, -483794904, -496311256, -496351175, -487962599, -470136709, -1577427462, -1598339078, -1600568581, -1600634279, -1330097415, -1325833495, -1317312771, -1275466019, -1293353515, -1297496649, -1293171465, -1301552649, -1305742569, -1557473769, -1607807481, -1603604985, -1595314665, -1595378138, -1603522266, 
-1603522330, -1606676314, -1606479681, -262794049, -205121403, -225572412, 1921977028, 1921870556, -225678721, -224598210, -226713298, -231886802, -231829186, -248598194, -265641530, -265582649, -265579009, -265554513, -534022993, -521585489, -525845329, -525849169, -257413713, -207016049, -219666481, -228034567, -232229591, -232196807, -232008440, -244654327, -253043191, -253041137, -1268125170, -1272393170, -1272425938, -1271376338, -1267184018, -1531426306, -1514481442, -1497699122, -1497636658, -1493655458, -1502040008, -1503018952, -1506029256, -1489472728, -1525145048, -1541863896, -1542898072, -1538704408, -456451591, -459404918, -459388790, -172701558, -139158390, -156983158, -152723318, -161046278, -164192018, -164175634]
-        self.model = AudioModel('audio')
+        self.model = AudioModel()
 
     def tearDown(self): # done in our pytest fixture after yield
         db.session.remove()

@@ -33,7 +33,7 @@ def setUp(self):
         audio = Audio(chromaprint_fingerprint=first_print, doc_id="blah", url="http://blah.com", context=[{"blah": 1}])
         db.session.add(audio)
         db.session.commit()
-        self.model = AudioModel('audio')
+        self.model = AudioModel()
 
     def tearDown(self): # done in our pytest fixture after yield
         db.session.remove()

@@ -16,7 +16,7 @@ class TestPrestoBlueprint(BaseTestCase):
     def setUp(self):
         super().setUp()
         first_print = [-248655731, -231870068, -230690420, -482429284, -478234963, -503476625, -520316369, -521361138, 1634511886, 1647109134, 1647046702, 1646940206, 1646924078, -500563482, -496367961, -471202139, -474282347, -476481849, -510101945, -510069497, -526854905, -237050874, -251730922, -251792089, -503463131, -513949140, -513949140, -1587752392, -1250138600, -180474360, -181522936, -194113975, -261353745, -253227346, -189264210, -188938850, -251825010, -251861834, -797121369, 1366287511, 1898902657, 1932452993, 1932452993, 1936651425, 1928253859, -491814237, -487750941, -496401919, -500657663, -500657643, -483876315, -517414355, -534219217, -529853138, -521597906, -524744474, -459335514, -255973226, -255973242, 1908283526, 1925055878, 1929249159, 1392390532, 1383981188, 1378656532, 1915527460, 1915527212, 1915528248, 1903135752, 1885837336, 1894160408, -253321943, -253326037, -262747077, -263193126, -262311942, -159482198, -151365974, -152489301, -152554837, -228052277, -232251189, -231202597, -243569493, -253069157, -257238902, -257242230, -521302374, -529751382, -517430614, -482831830, -483884501, -479492807, -534139591, -534190021, -534124501, -513115153, -479590737, -487980369, -486931793, -487062593, -488087363, -513253323, -529931243, -529865723, -521475067, -521475065, -252982986, -253179866, -260519706, -514274074, -472199258, -493164874, -1564809486, -1561472269, -1569918447, -1574116603, -1574113276, -1557204988, -483728380, -517313481, -528802706, -520549138, -1600584530, -1600453442, -1583800134, -1281875782, -1292339717, -1293328695, -1292907831, -1292969380, -1276199332, -504392116, -533941748, -533945844, -517414116, -517410760, -483794904, -496311256, -496351175, -487962599, -470136709, -1577427462, -1598339078, -1600568581, -1600634279, -1330097415, -1325833495, -1317312771, -1275466019, -1293353515, -1297496649, -1293171465, -1301552649, -1305742569, -1557473769, -1607807481, -1603604985, -1595314665, -1595378138, -1603522266, 
-1603522330, -1606676314, -1606479681, -262794049, -205121403, -225572412, 1921977028, 1921870556, -225678721, -224598210, -226713298, -231886802, -231829186, -248598194, -265641530, -265582649, -265579009, -265554513, -534022993, -521585489, -525845329, -525849169, -257413713, -207016049, -219666481, -228034567, -232229591, -232196807, -232008440, -244654327, -253043191, -253041137, -1268125170, -1272393170, -1272425938, -1271376338, -1267184018, -1531426306, -1514481442, -1497699122, -1497636658, -1493655458, -1502040008, -1503018952, -1506029256, -1489472728, -1525145048, -1541863896, -1542898072, -1538704408, -456451591, -459404918, -459388790, -172701558, -139158390, -156983158, -152723318, -161046278, -164192018, -164175634]
-        self.model = AudioModel('audio')
+        self.model = AudioModel()
 
     def tearDown(self): # done in our pytest fixture after yield
         db.session.remove()

@@ -16,7 +16,7 @@ class TestSyncSimilarityBlueprint(BaseTestCase):
     def setUp(self):
         super().setUp()
         first_print = 49805440634311326
-        self.model = AudioModel('audio')
+        self.model = AudioModel()
 
     def tearDown(self): # done in our pytest fixture after yield
         db.session.remove()

@@ -23,7 +23,7 @@ def respond(self, task):
 class TestVideoSimilarityBlueprint(BaseTestCase):
     def setUp(self):
         super().setUp()
-        self.model = VideoModel('video')
+        self.model = VideoModel()
 
     def test_get_tempfile(self):
         self.assertIsInstance(self.model.get_tempfile(), tempfile._TemporaryFileWrapper)

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -97,30 +97,6 @@ services:
   #     - .env_file
   #   environment:
   #     MODEL_NAME: mdebertav3filipino
-  # video:
-  #   build: .
-  #   platform: linux/x86_64
-  #   command: ["make", "run_model"]
-  #   volumes:
-  #     - ".:/app"
-  #   depends_on:
-  #     - redis
-  #   env_file:
-  #     - .env_file
-  #   environment:
-  #     MODEL_NAME: video
-  # audio:
-  #   build: .
-  #   platform: linux/x86_64
-  #   command: ["make", "run_model"]
-  #   volumes:
-  #     - ".:/app"
-  #   depends_on:
-  #     - redis
-  #   env_file:
-  #     - .env_file
-  #   environment:
-  #     MODEL_NAME: audio
   queue_worker:
     build: .
     platform: linux/x86_64

@@ -260,12 +260,6 @@ def run_model():
     model_config['options']
   )
 
-
-@manager.command
-def run_video_matcher():
-  """Runs the video matcher."""
-  VideoMatcher.start_server()
-
 @manager.command
 def init():
   """Initializes the service."""

diff --git a/production/Dockerfile b/production/Dockerfile
@@ -19,10 +19,8 @@ RUN chmod 755 /opt/bin/*.sh
 
 WORKDIR /app
 
-RUN apt-get update && apt-get install -y ffmpeg cmake swig libavcodec-dev libavformat-dev
-RUN apt-get update && apt-get install -y ffmpeg swig
+RUN apt-get update && apt-get install -y cmake swig
 RUN apt-get clean
-RUN ln -s /usr/bin/ffmpeg /usr/local/bin/ffmpeg
 
 COPY . .