From 2ed35efffe16c9af6a1b3e7cc204fb48ff493761 Mon Sep 17 00:00:00 2001
From: Nicolas Beguier
Date: Wed, 16 Oct 2019 16:49:23 +0200
Subject: [PATCH 1/2] Display probability of maliciousness

---
 .gitignore               |  3 +++
 clustering/classifier.py |  3 ++-
 clustering/utility.py    | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 69d62ce..ed88112 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,3 +108,6 @@ ENV/
 
 # MacOS files
 *.DS_Store
+
+# Models
+Classification/
diff --git a/clustering/classifier.py b/clustering/classifier.py
index cd3148b..857fefa 100644
--- a/clustering/classifier.py
+++ b/clustering/classifier.py
@@ -61,6 +61,7 @@ def test_model(names, labels, attributes, model, print_res=True, print_res_verbo
     # to predict the target values
 
     if print_res:
+        utility.get_proba('malicious', labels_predicted_proba_test)
         utility.get_classification_results(names, labels_predicted_test)
 
     if print_res_verbose:
@@ -71,7 +72,7 @@ def test_model(names, labels, attributes, model, print_res=True, print_res_verbo
     if print_score:
         utility.get_score(labels, labels_predicted_test)
 
-    return labels_predicted_test
+    return labels_predicted_test, labels_predicted_proba_test
 
 
 def parsing_commands():
diff --git a/clustering/utility.py b/clustering/utility.py
index 6341b10..375ce61 100644
--- a/clustering/utility.py
+++ b/clustering/utility.py
@@ -143,6 +143,23 @@ def get_score(labels, labels_predicted):
         logging.exception(error_message)
 
 
+def get_proba(label, labels_predicted_proba):
+    """
+    Print to stdout the probability that the first sample belongs to the given class.
+
+    -------
+    Parameters:
+    - label: string
+        Name of the class whose probability is printed ('benign' or 'malicious').
+    - labels_predicted_proba: matrix
+        Contains in the first column the probability of the samples being benign,
+        and in the second one the probability of them being malicious.
+
+    """
+    label_id = ['benign', 'malicious'].index(label)
+    print("{}: {} %".format(label, int(labels_predicted_proba[0][label_id]*100)))
+
+
 def get_nb_trees_specific_label(model, attributes, labels, labels_predicted, threshold):
     """
     Get the number of trees which gave the same prediction as the one of the whole forest.
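Reviewer note on the probability matrix: labels_predicted_proba_test is the output of
scikit-learn's predict_proba, an (n_samples, n_classes) array whose column order follows
model.classes_. A minimal, self-contained sketch of how such a matrix arises (not JaSt
code; the random-forest model and toy data below are assumptions for illustration only):

    # Hedged sketch: how a scikit-learn classifier yields the matrix that
    # get_proba() reads as labels_predicted_proba[0][label_id].
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    attributes = np.random.rand(10, 4)              # toy feature vectors
    labels = ['benign'] * 5 + ['malicious'] * 5     # toy labels
    model = RandomForestClassifier(n_estimators=10).fit(attributes, labels)

    proba = model.predict_proba(attributes)         # shape (n_samples, 2)
    # Columns follow model.classes_, here sorted as ['benign', 'malicious'],
    # which matches get_proba's ['benign', 'malicious'].index(label) lookup.
    print('malicious: {} %'.format(int(proba[0][1] * 100)))

This also motivates the new tuple return of test_model: callers such as
classifier_by_lines.py (patch 2/2) need the raw probabilities, not only the hard labels.
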
From 19ef0eea82717a705510df8e6898131ae14834e9 Mon Sep 17 00:00:00 2001
From: Nicolas Beguier
Date: Wed, 16 Oct 2019 16:50:00 +0200
Subject: [PATCH 2/2] Add classifier_by_lines, which analyses a JS file line by line

---
 README.md                         |   1 +
 clustering/classifier_by_lines.py | 104 ++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 clustering/classifier_by_lines.py

diff --git a/README.md b/README.md
index 5e60fcf..1381e19 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ To use this tool:
 1) *python3 <path-of-clustering/learner.py> --help*;
 2) *python3 <path-of-clustering/updater.py> --help*;
 3) *python3 <path-of-clustering/classifier.py> --help*.
+4) *python3 <path-of-clustering/classifier_by_lines.py> --help*.
 
 - Clustering of JavaScript samples into *k* (configurable) families.
 To use this tool: *python3 <path-of-clustering/clustering.py> --help*.
diff --git a/clustering/classifier_by_lines.py b/clustering/classifier_by_lines.py
new file mode 100644
index 0000000..bf8364c
--- /dev/null
+++ b/clustering/classifier_by_lines.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+
+# Standard
+import argparse
+
+# JaSt lib
+import utility
+import classifier
+import static_analysis
+from is_js import is_js_file
+
+# Debug
+# from pdb import set_trace as st
+
+def parsing_commands():
+    """
+    Creation of an ArgumentParser object, holding all the information necessary to parse
+    the command line into Python data types.
+
+    -------
+    Returns:
+    - ArgumentParser such as:
+      * js_dirs=arg_obj['d'],
+      * labels_d=arg_obj['l'],
+      * js_files=arg_obj['f'],
+      * labels_f=arg_obj['lf'],
+      * model=arg_obj['m'],
+      * threshold=arg_obj['th'],
+      * tolerance=arg_obj['t'][0],
+      * n=arg_obj['n'][0].
+    A more thorough description can be obtained:
+    >$ python3 <path-of-classifier_by_lines.py> -help
+    """
+
+    parser = argparse.ArgumentParser(description='Given a list of directory or file paths,\
+    detects malicious JS inputs.')
+
+    parser.add_argument('--d', metavar='DIR', type=str, nargs='+',
+                        help='directories containing the JS files to be analyzed')
+    parser.add_argument('--l', metavar='LABEL', type=str, nargs='+',
+                        choices=['benign', 'malicious', '?'],
+                        help='labels of the JS directories to evaluate the model from')
+    parser.add_argument('--f', metavar='FILE', type=str, nargs='+', help='files to be analyzed')
+    parser.add_argument('--lf', metavar='LABEL', type=str, nargs='+',
+                        choices=['benign', 'malicious', '?'],
+                        help='labels of the JS files to evaluate the model from')
+    parser.add_argument('--m', metavar='MODEL', type=str, nargs=1,
+                        help='path of the model used to classify the new JS inputs ' +
+                             '(see >$ python3 <path-of-learner.py> -help ' +
+                             'to build a model)')
+    parser.add_argument('--th', metavar='THRESHOLD', type=float, nargs=1, default=[0.29],
+                        help='threshold above which all samples are considered malicious')
+    utility.parsing_commands(parser)
+
+    return vars(parser.parse_args())
+
+OPTS = parsing_commands()
+JAVASCRIPT = OPTS['f'][0]
+MODEL = OPTS['m'][0]
+TMPFILENAME = '/tmp/.tmp.js'
+
+def get_malicious_score(js):
+    """
+    Classify the sub-JavaScript given as input and return its malicious probability in percent.
+    """
+    names, attributes, labels = static_analysis.main_analysis \
+        (js_files=[js], js_dirs=OPTS['d'], labels_files=OPTS['lf'], labels_dirs=OPTS['l'], \
+         n=OPTS['n'][0], tolerance=OPTS['t'][0], dict_not_hash=OPTS['dnh'][0])
+    if not names:
+        return 0
+    _, labels_predicted_proba = classifier.test_model(names, labels, attributes, \
+        model=MODEL, print_res=False, print_score=False)
+    malicious_proba = int(labels_predicted_proba[0][1]*100)
+    return malicious_proba
+
+def main():
+    """
+    Read the input JS file line by line and print a maliciousness score per valid chunk.
+    """
+    js = open(JAVASCRIPT, 'r')
+    copy = open(TMPFILENAME, 'w+')
+    line = js.readline()
+    n = 1
+    begin_line = n
+    while line:
+        copy.write(str(line))
+        copy.close()
+        # is_js_file returns 0 when the accumulated snippet is syntactically valid JS
+        if is_js_file(TMPFILENAME) == 0:
+            score = get_malicious_score(TMPFILENAME)
+            print('Line {} to {}: {}%'.format(begin_line, n, score))
+            begin_line = n + 1
+            # The chunk has been analysed: start a fresh accumulation
+            copy = open(TMPFILENAME, 'w+')
+        else:
+            # Not yet a valid JS snippet: keep accumulating lines
+            copy = open(TMPFILENAME, 'a+')
+        line = js.readline()
+        n += 1
+    copy.close()
+    js.close()
+
+if __name__ == '__main__':
+    main()
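
For context, a hypothetical run of the new tool (paths and the model name are
illustrative; a model must first be trained, e.g. with learner.py, and the output
below merely mirrors the 'Line {} to {}: {}%' format printed by main()):

    $ python3 clustering/classifier_by_lines.py --f sample.js --m Classification/model
    Line 1 to 3: 8%
    Line 4 to 10: 93%

Each reported range is the shortest run of consecutive lines that forms syntactically
valid JavaScript, and the percentage is the model's malicious-class probability for
that chunk.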