diff --git a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py index d2cedd0e3c..bd4719511a 100644 --- a/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py +++ b/api_app/analyzers_manager/file_analyzers/phishing/phishing_form_compiler.py @@ -1,6 +1,7 @@ import logging from datetime import date, timedelta from typing import Dict +from urllib.parse import urlparse import requests from faker import Faker @@ -135,11 +136,31 @@ def identify_text_input(self, input_name: str) -> str: if input_name in names: return fake_value - def compile_form_field(self, form) -> (dict, str): - result: {} = {} - # setting default to page itself if action is not specified + def extract_action_attribute(self, form) -> str: if not (form_action := form.get("action", None)): + logger.info( + f"'action' attribute not found in form. Defaulting to {self.target_site=}" + ) form_action = self.target_site + + # if relative url extracted, clean it from '/' and concatenate everything + # if action was not extracted in previous step the if should not pass as it is a url + if not urlparse(form_action).netloc: + logger.info(f"Found relative url in {form_action=}") + base_site = self.target_site + if base_site.endswith("/"): + base_site = base_site[:-1] + if form_action.startswith("/"): + form_action = form_action.replace("/", "", 1) + + form_action = base_site + "/" + form_action + + logger.info(f"Extracted action to post data to: {form_action}") + return form_action + + def compile_form_field(self, form) -> dict: + result: {} = {} + for element in form.findall(".//input"): input_type: str = element.get("type", None) input_name: str = element.get("name", None) @@ -169,12 +190,13 @@ def compile_form_field(self, form) -> (dict, str): f"Job #{self.job_id}: Sending value {value_to_set} for {input_name=}" ) result.setdefault(input_name, value_to_set) - return result, form_action + 
return result def perform_request_to_form(self, form) -> Response: - params, dest_url = self.compile_form_field(form) + params = self.compile_form_field(form) + dest_url = self.extract_action_attribute(form) logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") - return requests.post( + response = requests.post( url=dest_url, data=params, proxies=( @@ -183,14 +205,25 @@ def perform_request_to_form(self, form) -> Response: else None ), ) + logger.info(f"Request headers: {response.request.headers}") + return response @staticmethod def handle_3xx_response(response: Response) -> [str]: + result: [] = [] # extract all redirection history - return [history.request.url for history in response.history] + for history in response.history: + logger.info( + f"Extracting 3xx {response.status_code} HTTP response with url {history.request.url}" + ) + result.append(history.request.url) + return result @staticmethod def handle_2xx_response(response: Response) -> str: + logger.info( + f"Extracting 2xx {response.status_code} response with url {response.request.url}" + ) return response.request.url def is_js_used_in_page(self) -> bool: @@ -202,6 +235,7 @@ def is_js_used_in_page(self) -> bool: def analyze_responses(self, responses: [Response]) -> {}: result: [] = [] for response in responses: + logger.info(f"Response headers for {response.url}: {response.headers}") try: # handle 4xx and 5xx response.raise_for_status() diff --git a/api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py b/api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py new file mode 100644 index 0000000000..5d4e380c9c --- /dev/null +++ b/api_app/analyzers_manager/migrations/0140_analyzerreport_analyzers_m_data_mo_a1952b_idx.py @@ -0,0 +1,20 @@ +# Generated by Django 4.2.16 on 2024-12-12 11:45 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("analyzers_manager", 
"0139_alter_analyzerconfig_mapping_data_model"), + ] + + operations = [ + migrations.AddIndex( + model_name="analyzerreport", + index=models.Index( + fields=["data_model_content_type", "data_model_object_id"], + name="analyzers_m_data_mo_a1952b_idx", + ), + ), + ] diff --git a/api_app/analyzers_manager/models.py b/api_app/analyzers_manager/models.py index 876c3cc68a..385faa8878 100644 --- a/api_app/analyzers_manager/models.py +++ b/api_app/analyzers_manager/models.py @@ -50,7 +50,9 @@ class AnalyzerReport(AbstractReport): class Meta: unique_together = [("config", "job")] - indexes = AbstractReport.Meta.indexes + indexes = AbstractReport.Meta.indexes + [ + models.Index(fields=["data_model_content_type", "data_model_object_id"]) + ] def clean(self): if self.data_model_content_type: diff --git a/integrations/phishing_analyzers/analyzers/driver_wrapper.py b/integrations/phishing_analyzers/analyzers/driver_wrapper.py index 5af03079cc..b6cf2feb91 100644 --- a/integrations/phishing_analyzers/analyzers/driver_wrapper.py +++ b/integrations/phishing_analyzers/analyzers/driver_wrapper.py @@ -4,6 +4,9 @@ from typing import Iterator from selenium.common import WebDriverException +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.wait import WebDriverWait from seleniumwire.request import Request from seleniumwire.webdriver import ChromeOptions, Remote @@ -40,7 +43,8 @@ def handle_exception(self, *args, **kwargs): f"Error while performing {func.__name__}" f"{' for url=' + url if func.__name__ == 'navigate' else ''}: {e}" ) - self.restart(motivation=func.__name__) + # default is 5 + self.restart(motivation=func.__name__, timeout_wait_page=5) func(self, *args, **kwargs) return handle_exception @@ -90,7 +94,7 @@ def _init_driver(self, window_width: int, window_height: int) -> Remote: ) return driver - def restart(self, motivation: str = ""): + def restart(self, motivation: str = "", 
timeout_wait_page: int = 0): logger.info(f"Restarting driver: {motivation=}") self._driver.quit() self._driver = self._init_driver( @@ -98,10 +102,10 @@ def restart(self, motivation: str = ""): ) if self.last_url: logger.info(f"Navigating to {self.last_url} after driver has restarted") - self.navigate(self.last_url) + self.navigate(self.last_url, timeout_wait_page=timeout_wait_page) @driver_exception_handler - def navigate(self, url: str = ""): + def navigate(self, url: str = "", timeout_wait_page: int = 0): if not url: logger.error("Empty URL! Something's wrong!") return @@ -109,6 +113,13 @@ def navigate(self, url: str = ""): self.last_url = url logger.info(f"Navigating to {url=}") self._driver.get(url) + # dynamically wait for page to load its content with a fallback + # of `timeout_wait_page` seconds. + # waiting to see if any visible input tag appears + if timeout_wait_page: + WebDriverWait(self._driver, timeout=timeout_wait_page).until( + EC.visibility_of_any_elements_located((By.TAG_NAME, "input")) + ) @driver_exception_handler def get_page_source(self) -> str: diff --git a/integrations/phishing_analyzers/analyzers/extract_phishing_site.py b/integrations/phishing_analyzers/analyzers/extract_phishing_site.py index a03e0bfa01..060323def6 100644 --- a/integrations/phishing_analyzers/analyzers/extract_phishing_site.py +++ b/integrations/phishing_analyzers/analyzers/extract_phishing_site.py @@ -57,7 +57,7 @@ def analyze_target( window_width=window_width, window_height=window_height, ) - driver_wrapper.navigate(url=target_url) + driver_wrapper.navigate(url=target_url, timeout_wait_page=5) result: str = json.dumps(extract_driver_result(driver_wrapper), default=str) logger.debug(f"JSON dump of driver {result=}")