Skip to content

Commit 24acc13

Browse files
jlevypaloalto and maimorag
authored and committed
Update docker ml (#35081)
* updated docker * added the rest * devdemisto/ml:1.0.0.100486 * fix tpb * return on no incidents * remove runonce * remove space * fixed * fix create incidents script * new docker * revert: fix create incidents script * add outputs to DBotFindSimilarIncidents * new tpb DBotFindSimilarIncidents-test * new docker * bump transformers * Empty-Commit * fix conf.json * more fixes * more fixes * new docker * RN * new docker * revert dockers * more stuff * redirect stderr * docker * format * format * RN * more stuff * build fixes * build fixes * fix unit-tests * more docker changes * more docker changes * build fixes * suppress logger * build fixes * build fixes
1 parent e748e6b commit 24acc13

File tree

40 files changed

+1144
-917
lines changed

40 files changed

+1144
-917
lines changed

Packs/Base/ReleaseNotes/1_34_28.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
2+
#### Scripts
3+
4+
##### DBotTrainTextClassifierV2
5+
6+
- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
7+
##### DBotBuildPhishingClassifier
8+
9+
- Changed the Docker image to: *demisto/python3:3.11.9.101916*.
10+
##### DBotPreProcessTextData
11+
12+
- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
13+
##### DBotPredictPhishingWords
14+
15+
- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
16+
##### DBotFindSimilarIncidents
17+
18+
- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
19+
##### GetMLModelEvaluation
20+
21+
- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
22+
##### DBotFindSimilarIncidentsByIndicators
23+
24+
- Updated the Docker image to: *demisto/ml:1.0.0.101889*.

Packs/Base/Scripts/DBotBuildPhishingClassifier/DBotBuildPhishingClassifier.py

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,12 @@
1+
from CommonServerPython import *
12
import base64
2-
import copy
33
import gc
44

5-
from CommonServerPython import *
6-
7-
PREFIXES_TO_REMOVE = ['incident.']
85
ALL_LABELS = "*"
96

107

118
def preprocess_incidents_field(incidents_field):
12-
incidents_field = incidents_field.strip()
13-
for prefix in PREFIXES_TO_REMOVE:
14-
if incidents_field.startswith(prefix):
15-
incidents_field = incidents_field[len(prefix):]
16-
return incidents_field
9+
return incidents_field.strip().removeprefix('incident.')
1710

1811

1912
def get_phishing_map_labels(comma_values):
@@ -28,7 +21,7 @@ def get_phishing_map_labels(comma_values):
2821
labels_dict[splited[0].strip()] = splited[1].strip()
2922
else:
3023
labels_dict[v] = v
31-
return {k: v for k, v in labels_dict.items()}
24+
return dict(labels_dict.items())
3225

3326

3427
def build_query_in_reepect_to_phishing_labels(args):
@@ -38,17 +31,17 @@ def build_query_in_reepect_to_phishing_labels(args):
3831
return args
3932
mapping_dict = get_phishing_map_labels(mapping)
4033
tag_field = args['tagField']
41-
tags_union = ' '.join(['"{}"'.format(label) for label in mapping_dict])
42-
mapping_query = '{}:({})'.format(tag_field, tags_union)
34+
tags_union = ' '.join([f'"{label}"' for label in mapping_dict])
35+
mapping_query = f'{tag_field}:({tags_union})'
4336
if 'query' not in args or args['query'].strip() == '':
4437
args['query'] = mapping_query
4538
else:
46-
args['query'] = '({}) and ({})'.format(query, mapping_query)
39+
args['query'] = f'({query}) and ({mapping_query})'
4740
return args
4841

4942

5043
def get_incidents(d_args):
51-
get_incidents_by_query_args = copy.deepcopy(d_args)
44+
get_incidents_by_query_args = d_args.copy()
5245
get_incidents_by_query_args['NonEmptyFields'] = d_args['tagField']
5346
fields_names_to_populate = ['tagField', 'emailsubject', 'emailbody', "emailbodyhtml"]
5447
fields_to_populate = [get_incidents_by_query_args.get(x, None) for x in fields_names_to_populate]
@@ -63,15 +56,15 @@ def get_incidents(d_args):
6356

6457

6558
def preprocess_incidents(incidents, d_args):
66-
text_pre_process_args = copy.deepcopy(d_args)
59+
text_pre_process_args = d_args.copy()
6760
text_pre_process_args['inputType'] = 'json_b64_string'
6861
text_pre_process_args['input'] = base64.b64encode(incidents.encode('utf-8')).decode('ascii')
6962
text_pre_process_args['preProcessType'] = 'nlp'
7063
email_body_fields = [text_pre_process_args.get("emailbody"), text_pre_process_args.get("emailbodyhtml")]
7164
email_body = "|".join([x for x in email_body_fields if x])
72-
text_pre_process_args['textFields'] = "%s,%s" % (text_pre_process_args['emailsubject'], email_body)
73-
text_pre_process_args['whitelistFields'] = "{0},{1}".format('dbot_processed_text',
74-
text_pre_process_args['tagField'])
65+
text_pre_process_args['textFields'] = "{},{}".format(text_pre_process_args['emailsubject'], email_body)
66+
text_pre_process_args['whitelistFields'] = "{},{}".format('dbot_processed_text',
67+
text_pre_process_args['tagField'])
7568
res = demisto.executeCommand("DBotPreProcessTextData", text_pre_process_args)
7669
if is_error(res):
7770
return_error(get_error(res))
@@ -81,7 +74,7 @@ def preprocess_incidents(incidents, d_args):
8174

8275

8376
def train_model(processed_text_data, d_args):
84-
train_model_args = copy.deepcopy(d_args)
77+
train_model_args = d_args.copy()
8578
train_model_args['inputType'] = 'json_b64_string'
8679
train_model_args['input'] = base64.b64encode(processed_text_data.encode('utf-8')).decode('ascii')
8780
train_model_args['overrideExistingModel'] = 'true'
@@ -90,7 +83,7 @@ def train_model(processed_text_data, d_args):
9083

9184

9285
def main():
93-
d_args = dict(demisto.args())
86+
d_args = demisto.args()
9487
for arg in ['tagField', 'emailbody', 'emailbodyhtml', 'emailsubject', 'timeField']:
9588
d_args[arg] = preprocess_incidents_field(d_args.get(arg, ''))
9689

Packs/Base/Scripts/DBotBuildPhishingClassifier/DBotBuildPhishingClassifier.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ args:
44
- defaultValue: Phishing
55
description: A comma-separated list of incident types by which to filter.
66
name: incidentTypes
7-
- description: 'The start date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200")'
7+
- description: 'The start date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", "2019-01-01T00:00:00 +0200").'
88
name: fromDate
9-
- description: 'The end date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200")'
9+
- description: 'The end date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", "2019-01-01T00:00:00 +0200").'
1010
name: toDate
1111
- defaultValue: '3000'
1212
description: The maximum number of incidents to fetch.
@@ -39,7 +39,7 @@ args:
3939
- description: The model name to store in the system.
4040
name: modelName
4141
- defaultValue: '*'
42-
description: 'A comma-separated list of email tags values and mapping. The script considers only the tags specified in this field. You can map a label to another value by using this format: LABEL:MAPPED_LABEL. For example, for 4 values in email tag: malicious, credentials harvesting, inner communitcation, external legit email, unclassified. While training, we want to ignore "unclassified" tag, and refer to "credentials harvesting" as "malicious" too. Also, we want to merge "inner communitcation" and "external legit email" to one tag called "non-malicious". The input will be: malicious, credentials harvesting:malicious, inner communitcation:non-malicious, external legit email:non-malicious'
42+
description: 'A comma-separated list of email tags values and mapping. The script considers only the tags specified in this field. You can map a label to another value by using this format: LABEL:MAPPED_LABEL. For example, for 4 values in email tag: malicious, credentials harvesting, inner communication, external legit email, unclassified. While training, we want to ignore "unclassified" tag, and refer to "credentials harvesting" as "malicious" too. Also, we want to merge "inner communication" and "external legit email" to one tag called "non-malicious". The input will be: malicious, credentials harvesting:malicious, inner communication:non-malicious, external legit email:non-malicious.'
4343
name: phishingLabels
4444
- defaultValue: emailsubject
4545
description: Incident field name with the email subject.
@@ -83,8 +83,7 @@ tags:
8383
- ml
8484
timeout: 12µs
8585
type: python
86-
dockerimage: demisto/ml:1.0.0.45981
87-
runonce: true
86+
dockerimage: demisto/python3:3.11.9.101916
8887
tests:
8988
- Create Phishing Classifier V2 ML Test
9089
- DBotCreatePhishingClassifierV2FromFile-Test

Packs/Base/Scripts/DBotBuildPhishingClassifier/DBotBuildPhishingClassifier_test.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ def test_no_mapping_no_query():
1313
def test_no_mapping_with_query():
1414
args = {'phishingLabels': '*', 'query': QUERY}
1515
args = build_query_in_reepect_to_phishing_labels(args)
16-
assert 'query' in args and args['query'] == QUERY
16+
assert 'query' in args
17+
assert args['query'] == QUERY
1718

1819

1920
def test_mapping_no_query():
@@ -27,6 +28,6 @@ def test_mapping_with_query():
2728
args = {'phishingLabels': MAPPING, 'tagField': 'closeReason', 'query': QUERY}
2829
args = build_query_in_reepect_to_phishing_labels(args)
2930
assert 'query' in args
30-
opt1 = args['query'] == '({}) and (closeReason:("spam" "legit"))'.format(QUERY)
31-
opt2 = args['query'] == '({}) and (closeReason:("legit" "spam"))'.format(QUERY)
31+
opt1 = args['query'] == f'({QUERY}) and (closeReason:("spam" "legit"))'
32+
opt2 = args['query'] == f'({QUERY}) and (closeReason:("legit" "spam"))'
3233
assert opt1 or opt2

Packs/Base/Scripts/DBotFindSimilarIncidents/DBotFindSimilarIncidents.yml

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,27 @@ script: '-'
8686
subtype: python3
8787
timeout: '0'
8888
type: python
89-
dockerimage: demisto/ml:1.0.0.94241
89+
dockerimage: demisto/ml:1.0.0.101889
9090
runas: DBotWeakRole
91-
runonce: true
9291
tests:
93-
- No tests (auto formatted)
92+
- DBotFindSimilarIncidents-test
9493
fromversion: 5.0.0
94+
outputs:
95+
- contextPath: DBotFindSimilarIncidents.isSimilarIncidentFound
96+
description: Indicates whether similar incidents have been found.
97+
type: boolean
98+
- contextPath: DBotFindSimilarIncidents.similarIncident.created
99+
description: The creation date of the linked incident.
100+
type: date
101+
- contextPath: DBotFindSimilarIncidents.similarIncident.id
102+
description: The ID of the linked incident.
103+
type: string
104+
- contextPath: DBotFindSimilarIncidents.similarIncident.name
105+
description: The name of the linked incident.
106+
type: string
107+
- contextPath: DBotFindSimilarIncidents.similarIncident.similarity incident
108+
description: The similarity of the linked incident represented as a float in the range 0-1.
109+
type: number
110+
- contextPath: DBotFindSimilarIncidents.similarIncident.details
111+
description: The details of the linked incident.
112+
type: string

Packs/Base/Scripts/DBotFindSimilarIncidentsByIndicators/DBotFindSimilarIncidentsByIndicators.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ script: '-'
4242
subtype: python3
4343
timeout: '0'
4444
type: python
45-
dockerimage: demisto/ml:1.0.0.88591
45+
dockerimage: demisto/ml:1.0.0.101889
4646
runas: DBotWeakRole
4747
tests:
4848
- DBotFindSimilarIncidentsByIndicators - Test

Packs/Base/Scripts/DBotPredictPhishingWords/DBotPredictPhishingWords.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
# pylint: disable=no-member
2-
32
from CommonServerPython import *
43
from string import punctuation
54
import demisto_ml
65
import numpy as np
6+
import logging
7+
8+
# Suppress log messages below ERROR level from the transformers library
9+
logging.getLogger('transformers').setLevel(logging.ERROR)
710

811
FASTTEXT_MODEL_TYPE = 'FASTTEXT_MODEL_TYPE'
912
TORCH_TYPE = 'torch'
@@ -14,27 +17,30 @@ def OrderedSet(iterable):
1417
return list(dict.fromkeys(iterable))
1518

1619

17-
def get_model_data(model_name, store_type, is_return_error):
18-
res_model_list = demisto.executeCommand("getList", {"listName": model_name})[0]
19-
res_model = demisto.executeCommand("getMLModel", {"modelName": model_name})[0]
20-
if is_error(res_model_list) and not is_error(res_model):
21-
model_data = res_model['Contents']['modelData']
22-
try:
23-
model_type = res_model['Contents']['model']["type"]["type"]
24-
return model_data, model_type
25-
except Exception:
26-
return model_data, UNKNOWN_MODEL_TYPE
27-
elif not is_error(res_model_list) and is_error(res_model):
28-
return res_model_list["Contents"], UNKNOWN_MODEL_TYPE
29-
elif not is_error(res_model_list) and not is_error(res_model):
30-
if store_type == "list":
31-
return res_model_list["Contents"], UNKNOWN_MODEL_TYPE
32-
elif store_type == "mlModel":
33-
model_data = res_model['Contents']['modelData']
34-
model_type = res_model['Contents']['model']["type"]["type"]
35-
return model_data, model_type
36-
else:
37-
handle_error("error reading model %s from Demisto" % model_name, is_return_error)
20+
def get_model_data(model_name: str, store_type: str, is_return_error: bool) -> tuple[dict, str]:
21+
22+
def load_from_models(model_name: str) -> None | tuple[dict, str]:
23+
res_model = demisto.executeCommand("getMLModel", {"modelName": model_name})
24+
if is_error(res_model):
25+
demisto.debug(get_error(res_model))
26+
return None
27+
model_data = res_model[0]['Contents']['modelData']
28+
model_type = dict_safe_get(res_model, [0, 'Contents', 'model', "type", "type"], UNKNOWN_MODEL_TYPE)
29+
return model_data, model_type
30+
31+
def load_from_list(model_name):
32+
res_model = demisto.executeCommand("getList", {"listName": model_name})
33+
if is_error(res_model):
34+
demisto.debug(get_error(res_model))
35+
return None
36+
return res_model[0]["Contents"], UNKNOWN_MODEL_TYPE
37+
38+
if store_type == "mlModel":
39+
res = load_from_models(model_name) or load_from_list(model_name)
40+
elif store_type == "list":
41+
res = load_from_list(model_name) or load_from_models(model_name)
42+
43+
return res or handle_error(f"error reading model {model_name} from Demisto", is_return_error) # type: ignore
3844

3945

4046
def handle_error(message, is_return_error):
@@ -88,6 +94,7 @@ def preprocess_text(text, model_type, is_return_error):
8894
else:
8995
words_to_token_maps = tokenized_text_result['originalWordsToTokens']
9096
return input_text, words_to_token_maps
97+
return None
9198

9299

93100
def predict_phishing_words(model_name, model_store_type, email_subject, email_body, min_text_length, label_threshold,
@@ -97,7 +104,9 @@ def predict_phishing_words(model_name, model_store_type, email_subject, email_bo
97104
model_type = FASTTEXT_MODEL_TYPE
98105
if model_type not in [FASTTEXT_MODEL_TYPE, TORCH_TYPE, UNKNOWN_MODEL_TYPE]:
99106
model_type = UNKNOWN_MODEL_TYPE
107+
100108
phishing_model = demisto_ml.phishing_model_loads_handler(model_data, model_type)
109+
101110
is_model_applied_on_a_single_incidents = isinstance(email_subject, str) and isinstance(email_body, str)
102111
if is_model_applied_on_a_single_incidents:
103112
return predict_single_incident_full_output(email_subject, email_body, is_return_error, label_threshold,
@@ -110,7 +119,7 @@ def predict_phishing_words(model_name, model_store_type, email_subject, email_bo
110119

111120

112121
def predict_batch_incidents_light_output(email_subject, email_body, phishing_model, model_type, min_text_length):
113-
text_list = [{'text': "%s \n%s" % (subject, body)} for subject, body in zip(email_subject, email_body)]
122+
text_list = [{'text': f"{subject} \n{body}"} for subject, body in zip(email_subject, email_body)]
114123
preprocessed_text_list = preprocess_text(text_list, model_type, is_return_error=False)
115124
batch_predictions = []
116125
for input_text in preprocessed_text_list:
@@ -132,14 +141,14 @@ def predict_batch_incidents_light_output(email_subject, email_body, phishing_mod
132141
'Type': entryTypes['note'],
133142
'Contents': batch_predictions,
134143
'ContentsFormat': formats['json'],
135-
'HumanReadable': 'Applied predictions on {} incidents.'.format(len(batch_predictions)),
144+
'HumanReadable': f'Applied predictions on {len(batch_predictions)} incidents.',
136145
}
137146

138147

139148
def predict_single_incident_full_output(email_subject, email_body, is_return_error, label_threshold, min_text_length,
140149
model_type, phishing_model, set_incidents_fields, top_word_limit,
141150
word_threshold):
142-
text = "%s \n%s" % (email_subject, email_body)
151+
text = f"{email_subject} \n{email_body}"
143152
input_text, words_to_token_maps = preprocess_text(text, model_type, is_return_error)
144153
filtered_text, filtered_text_number_of_words = phishing_model.filter_model_words(input_text)
145154
if filtered_text_number_of_words == 0:
@@ -163,22 +172,22 @@ def predict_single_incident_full_output(email_subject, email_body, is_return_err
163172
negative_tokens = OrderedSet(explain_result['NegativeWords'])
164173
positive_words = find_words_contain_tokens(positive_tokens, words_to_token_maps)
165174
negative_words = find_words_contain_tokens(negative_tokens, words_to_token_maps)
166-
positive_words = list(OrderedSet([s.strip(punctuation) for s in positive_words]))
167-
negative_words = list(OrderedSet([s.strip(punctuation) for s in negative_words]))
175+
positive_words = OrderedSet([s.strip(punctuation) for s in positive_words])
176+
negative_words = OrderedSet([s.strip(punctuation) for s in negative_words])
168177
positive_words = [w for w in positive_words if w.isalnum()]
169178
negative_words = [w for w in negative_words if w.isalnum()]
170179
highlighted_text_markdown = text.strip()
171180
for word in positive_words:
172181
for cased_word in [word.lower(), word.title(), word.upper()]:
173-
highlighted_text_markdown = re.sub(r'(?<!\w)({})(?!\w)'.format(cased_word), '**{}**'.format(cased_word),
182+
highlighted_text_markdown = re.sub(fr'(?<!\w)({cased_word})(?!\w)', f'**{cased_word}**',
174183
highlighted_text_markdown)
175184
highlighted_text_markdown = re.sub(r'\n+', '\n', highlighted_text_markdown)
176185
explain_result['PositiveWords'] = [w.lower() for w in positive_words]
177186
explain_result['NegativeWords'] = [w.lower() for w in negative_words]
178187
explain_result['OriginalText'] = text.strip()
179188
explain_result['TextTokensHighlighted'] = highlighted_text_markdown
180189
predicted_label = explain_result["Label"]
181-
explain_result_hr = dict()
190+
explain_result_hr = {}
182191
explain_result_hr['TextTokensHighlighted'] = highlighted_text_markdown
183192
explain_result_hr['Label'] = predicted_label
184193
explain_result_hr['Probability'] = "%.2f" % predicted_prob

Packs/Base/Scripts/DBotPredictPhishingWords/DBotPredictPhishingWords.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,7 @@ tags:
9898
- phishing
9999
timeout: 60µs
100100
type: python
101-
dockerimage: demisto/ml:1.0.0.32340
102-
runonce: true
101+
dockerimage: demisto/ml:1.0.0.101889
103102
tests:
104103
- Create Phishing Classifier V2 ML Test
105104
fromversion: 5.0.0

0 commit comments

Comments
 (0)