From 2ed35efffe16c9af6a1b3e7cc204fb48ff493761 Mon Sep 17 00:00:00 2001
From: Nicolas Beguier
Date: Wed, 16 Oct 2019 16:49:23 +0200
Subject: [PATCH 1/2] Display probability of maliciousness

---
 .gitignore               |  3 +++
 clustering/classifier.py |  3 ++-
 clustering/utility.py    | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 69d62ce..ed88112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,3 +108,6 @@ ENV/
 
 # MacOS files
 *.DS_Store
+
+# Models
+Classification/
diff --git a/clustering/classifier.py b/clustering/classifier.py
index cd3148b..857fefa 100644
--- a/clustering/classifier.py
+++ b/clustering/classifier.py
@@ -61,6 +61,7 @@ def test_model(names, labels, attributes, model, print_res=True, print_res_verbo
     # to predict the target values
 
     if print_res:
+        utility.get_proba('malicious', labels_predicted_proba_test)
         utility.get_classification_results(names, labels_predicted_test)
 
     if print_res_verbose:
@@ -71,7 +72,7 @@ def test_model(names, labels, attributes, model, print_res=True, print_res_verbo
     if print_score:
         utility.get_score(labels, labels_predicted_test)
 
-    return labels_predicted_test
+    return labels_predicted_test, labels_predicted_proba_test
 
 
 def parsing_commands():
diff --git a/clustering/utility.py b/clustering/utility.py
index 6341b10..375ce61 100644
--- a/clustering/utility.py
+++ b/clustering/utility.py
@@ -143,6 +143,23 @@ def get_score(labels, labels_predicted):
         logging.exception(error_message)
 
 
+def get_proba(label, labels_predicted_proba):
+    """
+    Print to stdout the probability that the first sample belongs to the given class.
+
+    -------
+    Parameters:
+    - label: string
+        Name of the class whose probability is printed ('benign' or 'malicious').
+    - labels_predicted_proba: matrix
+        Contains in the first column the probability of the samples being benign,
+        and in the second one the probability of them being malicious.
+
+    """
+    label_id = ['benign', 'malicious'].index(label)
+    print("{}: {} %".format(label, int(labels_predicted_proba[0][label_id]*100)))
+
+
 def get_nb_trees_specific_label(model, attributes, labels, labels_predicted, threshold):
     """
     Get the number of trees which gave the same prediction as the one of the whole forest.
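Reviewer note on the probability matrix: labels_predicted_proba_test is the output of
scikit-learn's predict_proba, an (n_samples, n_classes) array whose column order follows
model.classes_. A minimal, self-contained sketch of how such a matrix arises (not JaSt
code; the random-forest model and toy data below are assumptions for illustration only):

    # Hedged sketch: how a scikit-learn classifier yields the matrix that
    # get_proba() reads as labels_predicted_proba[0][label_id].
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    attributes = np.random.rand(10, 4)              # toy feature vectors
    labels = ['benign'] * 5 + ['malicious'] * 5     # toy labels
    model = RandomForestClassifier(n_estimators=10).fit(attributes, labels)

    proba = model.predict_proba(attributes)         # shape (n_samples, 2)
    # Columns follow model.classes_, here sorted as ['benign', 'malicious'],
    # which matches get_proba's ['benign', 'malicious'].index(label) lookup.
    print('malicious: {} %'.format(int(proba[0][1] * 100)))

This also motivates the new tuple return of test_model: callers such as
classifier_by_lines.py (patch 2/2) need the raw probabilities, not only the hard labels.
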
From 19ef0eea82717a705510df8e6898131ae14834e9 Mon Sep 17 00:00:00 2001
From: Nicolas Beguier
Date: Wed, 16 Oct 2019 16:50:00 +0200
Subject: [PATCH 2/2] Add classifier_by_lines, which analyses a JS file line by line

---
 README.md                         |   1 +
 clustering/classifier_by_lines.py | 104 ++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 clustering/classifier_by_lines.py

diff --git a/README.md b/README.md
index 5e60fcf..1381e19 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ To use this tool:
 1) *python3 <path-of-clustering/learner.py> --help*;
 2) *python3 <path-of-clustering/updater.py> --help*;
 3) *python3 <path-of-clustering/classifier.py> --help*.
+4) *python3 <path-of-clustering/classifier_by_lines.py> --help*.
 
 - Clustering of JavaScript samples into *k* (configurable) families.
 To use this tool: *python3 <path-of-clustering/clustering.py> --help*.
diff --git a/clustering/classifier_by_lines.py b/clustering/classifier_by_lines.py
new file mode 100644
index 0000000..bf8364c
--- /dev/null
+++ b/clustering/classifier_by_lines.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+
+# Standard
+import argparse
+
+# JaSt lib
+import utility
+import classifier
+import static_analysis
+from is_js import is_js_file
+
+# Debug
+# from pdb import set_trace as st
+
+def parsing_commands():
+    """
+    Creation of an ArgumentParser object, holding all the information necessary to parse
+    the command line into Python data types.
+
+    -------
+    Returns:
+    - ArgumentParser such as:
+      * js_dirs=arg_obj['d'],
+      * labels_d=arg_obj['l'],
+      * js_files=arg_obj['f'],
+      * labels_f=arg_obj['lf'],
+      * model=arg_obj['m'],
+      * threshold=arg_obj['th'],
+      * tolerance=arg_obj['t'][0],
+      * n=arg_obj['n'][0].
+    A more thorough description can be obtained:
+    >$ python3 <path-of-classifier_by_lines.py> -help
+    """
+
+    parser = argparse.ArgumentParser(description='Given a list of directory or file paths,\
+    detects malicious JS inputs.')
+
+    parser.add_argument('--d', metavar='DIR', type=str, nargs='+',
+                        help='directories containing the JS files to be analyzed')
+    parser.add_argument('--l', metavar='LABEL', type=str, nargs='+',
+                        choices=['benign', 'malicious', '?'],
+                        help='labels of the JS directories to evaluate the model from')
+    parser.add_argument('--f', metavar='FILE', type=str, nargs='+', help='files to be analyzed')
+    parser.add_argument('--lf', metavar='LABEL', type=str, nargs='+',
+                        choices=['benign', 'malicious', '?'],
+                        help='labels of the JS files to evaluate the model from')
+    parser.add_argument('--m', metavar='MODEL', type=str, nargs=1,
+                        help='path of the model used to classify the new JS inputs ' +
+                             '(see >$ python3 <path-of-learner.py> -help ' +
+                             'to build a model)')
+    parser.add_argument('--th', metavar='THRESHOLD', type=float, nargs=1, default=[0.29],
+                        help='threshold above which all samples are considered malicious')
+    utility.parsing_commands(parser)
+
+    return vars(parser.parse_args())
+
+OPTS = parsing_commands()
+JAVASCRIPT = OPTS['f'][0]
+MODEL = OPTS['m'][0]
+TMPFILENAME = '/tmp/.tmp.js'
+
+def get_malicious_score(js):
+    """
+    Classify the sub-JavaScript given as input and return its malicious probability in percent.
+    """
+    names, attributes, labels = static_analysis.main_analysis \
+        (js_files=[js], js_dirs=OPTS['d'], labels_files=OPTS['lf'], labels_dirs=OPTS['l'], \
+         n=OPTS['n'][0], tolerance=OPTS['t'][0], dict_not_hash=OPTS['dnh'][0])
+    if not names:
+        return 0
+    _, labels_predicted_proba = classifier.test_model(names, labels, attributes, \
+        model=MODEL, print_res=False, print_score=False)
+    malicious_proba = int(labels_predicted_proba[0][1]*100)
+    return malicious_proba
+
+def main():
+    """
+    Read the input JS file line by line and print a maliciousness score per valid chunk.
+    """
+    js = open(JAVASCRIPT, 'r')
+    copy = open(TMPFILENAME, 'w+')
+    line = js.readline()
+    n = 1
+    begin_line = n
+    while line:
+        copy.write(str(line))
+        copy.close()
+        # is_js_file returns 0 when the accumulated snippet is syntactically valid JS
+        if is_js_file(TMPFILENAME) == 0:
+            score = get_malicious_score(TMPFILENAME)
+            print('Line {} to {}: {}%'.format(begin_line, n, score))
+            begin_line = n + 1
+            # The chunk has been analysed: start a fresh accumulation
+            copy = open(TMPFILENAME, 'w+')
+        else:
+            # Not yet a valid JS snippet: keep accumulating lines
+            copy = open(TMPFILENAME, 'a+')
+        line = js.readline()
+        n += 1
+    copy.close()
+    js.close()
+
+if __name__ == '__main__':
+    main()
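
For context, a hypothetical run of the new tool (paths and the model name are
illustrative; a model must first be trained, e.g. with learner.py, and the output
below merely mirrors the 'Line {} to {}: {}%' format printed by main()):

    $ python3 clustering/classifier_by_lines.py --f sample.js --m Classification/model
    Line 1 to 3: 8%
    Line 4 to 10: 93%

Each reported range is the shortest run of consecutive lines that forms syntactically
valid JavaScript, and the percentage is the model's malicious-class probability for
that chunk.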