From c07549c603010c68862e876a01c7d877cd671ac7 Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Wed, 11 Dec 2024 12:13:00 +0100 Subject: [PATCH 01/10] Added more log messages --- .../phishing/phishing_form_compiler.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index d2cedd0e3c..9a71df4b94 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -139,7 +139,12 @@ def compile_form_field(self, form) -> (dict, str): result: {} = {} # setting default to page itself if action is not specified if not (form_action := form.get("action", None)): + logger.info( + f"'action' attribute not found in form. Defaulting to {self.target_site=}" + ) form_action = self.target_site + logger.info(f"Extracted action to post data to: {form_action}") + for element in form.findall(".//input"): input_type: str = element.get("type", None) input_name: str = element.get("name", None) @@ -186,11 +191,19 @@ def perform_request_to_form(self, form) -> Response: @staticmethod def handle_3xx_response(response: Response) -> [str]: + result: [] = [] # extract all redirection history - return [history.request.url for history in response.history] + for history in response.history: + logger.info( + f"Extracting 3xx {response.status_code} HTTP response with url {history.request.url}" + ) + result.append(history.request.url) @staticmethod def handle_2xx_response(response: Response) -> str: + logger.info( + f"Extracting 2xx {response.status_code} response with url {response.request.url}" + ) return response.request.url def is_js_used_in_page(self) -> bool: From 01d3ad525dc25a8ef5cd45be5b4014d97a590db5 Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Wed, 11 Dec 2024 16:55:00 +0100 Subject: 
[PATCH 02/10] Added action extract method with relative url --- .../phishing/phishing_form_compiler.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index 9a71df4b94..543ec5af71 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -1,6 +1,7 @@ import logging from datetime import date, timedelta from typing import Dict +from urllib.parse import urlparse import requests from faker import Faker @@ -135,15 +136,30 @@ def identify_text_input(self, input_name: str) -> str: if input_name in names: return fake_value - def compile_form_field(self, form) -> (dict, str): - result: {} = {} - # setting default to page itself if action is not specified + def _extract_action_attribute(self, form) -> str: if not (form_action := form.get("action", None)): logger.info( f"'action' attribute not found in form. 
Defaulting to {self.target_site=}" ) form_action = self.target_site + + # if relative url extracted, clean it from '/' and concatenate everything + # if action was not extracted in previous step the if should not pass as it is a url + if not urlparse(form_action).netloc: + logger.info(f"Found relative url in {form_action=}") + base_site = self.target_site + if base_site.endswith("/"): + base_site = base_site[:-1] + if form_action.startswith("/"): + form_action = form_action.replace("/", "", 1) + + form_action = base_site + "/" + form_action + logger.info(f"Extracted action to post data to: {form_action}") + return form_action + + def compile_form_field(self, form) -> dict: + result: {} = {} for element in form.findall(".//input"): input_type: str = element.get("type", None) @@ -174,10 +190,11 @@ def compile_form_field(self, form) -> (dict, str): f"Job #{self.job_id}: Sending value {value_to_set} for {input_name=}" ) result.setdefault(input_name, value_to_set) - return result, form_action + return result def perform_request_to_form(self, form) -> Response: - params, dest_url = self.compile_form_field(form) + params = self.compile_form_field(form) + dest_url = self._extract_action_attribute(form) logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") return requests.post( url=dest_url, From 5b5c91de4c54263d9eb2d6a46b5b20ae4f4ca02f Mon Sep 17 00:00:00 2001 From: Cristina Ascari <95929371+cristinaascari@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:16:20 +0100 Subject: [PATCH 03/10] added data model index in analyzer report (#2597) --- ...erreport_analyzers_m_data_mo_a1952b_idx.py | 20 +++++++++++++++++++ api_app/analyzers_manager/models.py | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py diff --git a/api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py 
b/api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py new file mode 100644 index 0000000000..5d4e380c9c --- /dev/null +++ b/api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py @@ -0,0 +1,20 @@ +# Generated by Django 4.2.16 on 2024-12-12 11:45 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("analyzers_manager", "0139_alter_analyzerconfig_mapping_data_model"), + ] + + operations = [ + migrations.AddIndex( + model_name="analyzerreport", + index=models.Index( + fields=["data_model_content_type", "data_model_object_id"], + name="analyzers_m_data_mo_a1952b_idx", + ), + ), + ] diff --git a/api_app/analyzers_manager/models.py b/api_app/analyzers_manager/models.py index 876c3cc68a..385faa8878 100644 --- a/api_app/analyzers_manager/models.py +++ b/api_app/analyzers_manager/models.py @@ -50,7 +50,9 @@ class AnalyzerReport(AbstractReport): class Meta: unique_together = [("config", "job")] - indexes = AbstractReport.Meta.indexes + indexes = AbstractReport.Meta.indexes + [ + models.Index(fields=["data_model_content_type", "data_model_object_id"]) + ] def clean(self): if self.data_model_content_type: From 841fd4e18e57625af212f7fa6c446e9db68c0fe0 Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Thu, 12 Dec 2024 15:38:49 +0100 Subject: [PATCH 04/10] Added headers logging to help with client error --- .../file_analyzers/phishing/phishing_form_compiler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index 543ec5af71..7c6c561b09 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -196,7 +196,7 @@ def perform_request_to_form(self, form) -> 
Response: params = self.compile_form_field(form) dest_url = self._extract_action_attribute(form) logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") - return requests.post( + response = requests.post( url=dest_url, data=params, proxies=( @@ -205,6 +205,8 @@ def perform_request_to_form(self, form) -> Response: else None ), ) + logger.info(f"Request headers: {response.request.headers}") + return response @staticmethod def handle_3xx_response(response: Response) -> [str]: @@ -232,6 +234,7 @@ def is_js_used_in_page(self) -> bool: def analyze_responses(self, responses: [Response]) -> {}: result: [] = [] for response in responses: + logger.info(f"Response headers for {response.url}: {response.headers}") try: # handle 4xx and 5xx response.raise_for_status() From 1b72a26082145a727a51e55f1c7eabdda19566be Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Thu, 12 Dec 2024 17:10:24 +0100 Subject: [PATCH 05/10] Added wait for page to appear --- .../analyzers/driver_wrapper.py | 19 +++++++++++++++---- .../analyzers/extract_phishing_site.py | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/integrations/phishing_analyzers/analyzers/driver_wrapper.py b/integrations/phishing_analyzers/analyzers/driver_wrapper.py index 5af03079cc..b6cf2feb91 100644 --- a/integrations/phishing_analyzers/analyzers/driver_wrapper.py +++ b/integrations/phishing_analyzers/analyzers/driver_wrapper.py @@ -4,6 +4,9 @@ from typing import Iterator from selenium.common import WebDriverException +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.wait import WebDriverWait from seleniumwire.request import Request from seleniumwire.webdriver import ChromeOptions, Remote @@ -40,7 +43,8 @@ def handle_exception(self, *args, **kwargs): f"Error while performing {func.__name__}" f"{' for url=' + url if func.__name__ == 'navigate' else ''}: {e}" ) - 
self.restart(motivation=func.__name__) + # default is 5 + self.restart(motivation=func.__name__, timeout_wait_page=5) func(self, *args, **kwargs) return handle_exception @@ -90,7 +94,7 @@ def _init_driver(self, window_width: int, window_height: int) -> Remote: ) return driver - def restart(self, motivation: str = ""): + def restart(self, motivation: str = "", timeout_wait_page: int = 0): logger.info(f"Restarting driver: {motivation=}") self._driver.quit() self._driver = self._init_driver( @@ -98,10 +102,10 @@ def restart(self, motivation: str = ""): ) if self.last_url: logger.info(f"Navigating to {self.last_url} after driver has restarted") - self.navigate(self.last_url) + self.navigate(self.last_url, timeout_wait_page=timeout_wait_page) @driver_exception_handler - def navigate(self, url: str = ""): + def navigate(self, url: str = "", timeout_wait_page: int = 0): if not url: logger.error("Empty URL! Something's wrong!") return @@ -109,6 +113,13 @@ def navigate(self, url: str = ""): self.last_url = url logger.info(f"Navigating to {url=}") self._driver.get(url) + # dynamically wait for page to load its content with a fallback + # of `timeout_wait_page` seconds.
+ # waiting to see if any visible input tag appears + if timeout_wait_page: + WebDriverWait(self._driver, timeout=timeout_wait_page).until( + EC.visibility_of_any_elements_located((By.TAG_NAME, "input")) + ) @driver_exception_handler def get_page_source(self) -> str: diff --git a/integrations/phishing_analyzers/analyzers/extract_phishing_site.py b/integrations/phishing_analyzers/analyzers/extract_phishing_site.py index a03e0bfa01..060323def6 100644 --- a/integrations/phishing_analyzers/analyzers/extract_phishing_site.py +++ b/integrations/phishing_analyzers/analyzers/extract_phishing_site.py @@ -57,7 +57,7 @@ def analyze_target( window_width=window_width, window_height=window_height, ) - driver_wrapper.navigate(url=target_url) + driver_wrapper.navigate(url=target_url, timeout_wait_page=5) result: str = json.dumps(extract_driver_result(driver_wrapper), default=str) logger.debug(f"JSON dump of driver {result=}") From c5db180d319afae142e20617464f97ecce001208 Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Wed, 11 Dec 2024 12:13:00 +0100 Subject: [PATCH 06/10] Added more log messages --- .../phishing/phishing_form_compiler.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index d2cedd0e3c..9a71df4b94 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -139,7 +139,12 @@ def compile_form_field(self, form) -> (dict, str): result: {} = {} # setting default to page itself if action is not specified if not (form_action := form.get("action", None)): + logger.info( + f"'action' attribute not found in form. 
Defaulting to {self.target_site=}" + ) form_action = self.target_site + logger.info(f"Extracted action to post data to: {form_action}") + for element in form.findall(".//input"): input_type: str = element.get("type", None) input_name: str = element.get("name", None) @@ -186,11 +191,19 @@ def perform_request_to_form(self, form) -> Response: @staticmethod def handle_3xx_response(response: Response) -> [str]: + result: [] = [] # extract all redirection history - return [history.request.url for history in response.history] + for history in response.history: + logger.info( + f"Extracting 3xx {response.status_code} HTTP response with url {history.request.url}" + ) + result.append(history.request.url) @staticmethod def handle_2xx_response(response: Response) -> str: + logger.info( + f"Extracting 2xx {response.status_code} response with url {response.request.url}" + ) return response.request.url def is_js_used_in_page(self) -> bool: From 6435bae1eda86a1267cb5b409322ff01b9dd4586 Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Wed, 11 Dec 2024 16:55:00 +0100 Subject: [PATCH 07/10] Added action extract method with relative url --- .../phishing/phishing_form_compiler.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index 9a71df4b94..543ec5af71 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -1,6 +1,7 @@ import logging from datetime import date, timedelta from typing import Dict +from urllib.parse import urlparse import requests from faker import Faker @@ -135,15 +136,30 @@ def identify_text_input(self, input_name: str) -> str: if input_name in names: return fake_value - def compile_form_field(self, form) -> (dict, str): - result: {} = {} - # setting 
default to page itself if action is not specified + def _extract_action_attribute(self, form) -> str: if not (form_action := form.get("action", None)): logger.info( f"'action' attribute not found in form. Defaulting to {self.target_site=}" ) form_action = self.target_site + + # if relative url extracted, clean it from '/' and concatenate everything + # if action was not extracted in previous step the if should not pass as it is a url + if not urlparse(form_action).netloc: + logger.info(f"Found relative url in {form_action=}") + base_site = self.target_site + if base_site.endswith("/"): + base_site = base_site[:-1] + if form_action.startswith("/"): + form_action = form_action.replace("/", "", 1) + + form_action = base_site + "/" + form_action + logger.info(f"Extracted action to post data to: {form_action}") + return form_action + + def compile_form_field(self, form) -> dict: + result: {} = {} for element in form.findall(".//input"): input_type: str = element.get("type", None) @@ -174,10 +190,11 @@ def compile_form_field(self, form) -> (dict, str): f"Job #{self.job_id}: Sending value {value_to_set} for {input_name=}" ) result.setdefault(input_name, value_to_set) - return result, form_action + return result def perform_request_to_form(self, form) -> Response: - params, dest_url = self.compile_form_field(form) + params = self.compile_form_field(form) + dest_url = self._extract_action_attribute(form) logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") return requests.post( url=dest_url, From 012214e87fdbb624a8eaefa686cd91bb09891463 Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Thu, 12 Dec 2024 15:38:49 +0100 Subject: [PATCH 08/10] Added headers logging to help with client error --- .../file_analyzers/phishing/phishing_form_compiler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py 
b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index 543ec5af71..7c6c561b09 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -196,7 +196,7 @@ def perform_request_to_form(self, form) -> Response: params = self.compile_form_field(form) dest_url = self._extract_action_attribute(form) logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") - return requests.post( + response = requests.post( url=dest_url, data=params, proxies=( @@ -205,6 +205,8 @@ def perform_request_to_form(self, form) -> Response: else None ), ) + logger.info(f"Request headers: {response.request.headers}") + return response @staticmethod def handle_3xx_response(response: Response) -> [str]: @@ -232,6 +234,7 @@ def is_js_used_in_page(self) -> bool: def analyze_responses(self, responses: [Response]) -> {}: result: [] = [] for response in responses: + logger.info(f"Response headers for {response.url}: {response.headers}") try: # handle 4xx and 5xx response.raise_for_status() From ae9ea09246d8d6483c9d6db1382833e49f0bc6ce Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Thu, 12 Dec 2024 17:10:24 +0100 Subject: [PATCH 09/10] Added wait for page to appear --- .../analyzers/driver_wrapper.py | 19 +++++++++++++++---- .../analyzers/extract_phishing_site.py | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/integrations/phishing_analyzers/analyzers/driver_wrapper.py b/integrations/phishing_analyzers/analyzers/driver_wrapper.py index 5af03079cc..b6cf2feb91 100644 --- a/integrations/phishing_analyzers/analyzers/driver_wrapper.py +++ b/integrations/phishing_analyzers/analyzers/driver_wrapper.py @@ -4,6 +4,9 @@ from typing import Iterator from selenium.common import WebDriverException +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from 
selenium.webdriver.support.wait import WebDriverWait from seleniumwire.request import Request from seleniumwire.webdriver import ChromeOptions, Remote @@ -40,7 +43,8 @@ def handle_exception(self, *args, **kwargs): f"Error while performing {func.__name__}" f"{' for url=' + url if func.__name__ == 'navigate' else ''}: {e}" ) - self.restart(motivation=func.__name__) + # default is 5 + self.restart(motivation=func.__name__, timeout_wait_page=5) func(self, *args, **kwargs) return handle_exception @@ -90,7 +94,7 @@ def _init_driver(self, window_width: int, window_height: int) -> Remote: ) return driver - def restart(self, motivation: str = ""): + def restart(self, motivation: str = "", timeout_wait_page: int = 0): logger.info(f"Restarting driver: {motivation=}") self._driver.quit() self._driver = self._init_driver( @@ -98,10 +102,10 @@ def restart(self, motivation: str = ""): ) if self.last_url: logger.info(f"Navigating to {self.last_url} after driver has restarted") - self.navigate(self.last_url) + self.navigate(self.last_url, timeout_wait_page=timeout_wait_page) @driver_exception_handler - def navigate(self, url: str = ""): + def navigate(self, url: str = "", timeout_wait_page: int = 0): if not url: logger.error("Empty URL! Something's wrong!") return @@ -109,6 +113,13 @@ def navigate(self, url: str = ""): self.last_url = url logger.info(f"Navigating to {url=}") self._driver.get(url) + # dynamically wait for page to load its content with a fallback + # of `timeout_wait_page` seconds.
+ # waiting to see if any visible input tag appears + if timeout_wait_page: + WebDriverWait(self._driver, timeout=timeout_wait_page).until( + EC.visibility_of_any_elements_located((By.TAG_NAME, "input")) + ) @driver_exception_handler def get_page_source(self) -> str: diff --git a/integrations/phishing_analyzers/analyzers/extract_phishing_site.py b/integrations/phishing_analyzers/analyzers/extract_phishing_site.py index a03e0bfa01..060323def6 100644 --- a/integrations/phishing_analyzers/analyzers/extract_phishing_site.py +++ b/integrations/phishing_analyzers/analyzers/extract_phishing_site.py @@ -57,7 +57,7 @@ def analyze_target( window_width=window_width, window_height=window_height, ) - driver_wrapper.navigate(url=target_url) + driver_wrapper.navigate(url=target_url, timeout_wait_page=5) result: str = json.dumps(extract_driver_result(driver_wrapper), default=str) logger.debug(f"JSON dump of driver {result=}") From 9800c080b47e0f38a49d899fc764a347f42906db Mon Sep 17 00:00:00 2001 From: Federico Gibertoni Date: Fri, 13 Dec 2024 11:23:16 +0100 Subject: [PATCH 10/10] Removed additional underscore in function name --- .../file_analyzers/phishing/phishing_form_compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index 7c6c561b09..bd4719511a 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -136,7 +136,7 @@ def identify_text_input(self, input_name: str) -> str: if input_name in names: return fake_value - def _extract_action_attribute(self, form) -> str: + def extract_action_attribute(self, form) -> str: if not (form_action := form.get("action", None)): logger.info( f"'action' attribute not found in form. 
Defaulting to {self.target_site=}" @@ -194,7 +194,7 @@ def compile_form_field(self, form) -> dict: def perform_request_to_form(self, form) -> Response: params = self.compile_form_field(form) - dest_url = self._extract_action_attribute(form) + dest_url = self.extract_action_attribute(form) logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") response = requests.post( url=dest_url,