Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pdf uri extractor and pivoting #2391

Merged
merged 11 commits into from
Jul 1, 2024
59 changes: 56 additions & 3 deletions api_app/analyzers_manager/file_analyzers/pdf_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
# See the file 'LICENSE' for copying permission.

import logging
import os
from typing import Any, List

import pdfreader
import peepdf
import pikepdf
from pdfid import pdfid

from api_app.analyzers_manager.classes import FileAnalyzer
Expand All @@ -19,14 +22,64 @@ def flatten(list_of_lists: List[List[Any]]) -> List[Any]:
return [item for sublist in list_of_lists for item in sublist]

def run(self):
    """Run the three pdf parsers (peepdf, pdfid, pdfreader) and merge results.

    When the document has exactly one page, the uris found by peepdf and
    pdfreader are deduplicated and exposed under ``results["uris"]`` for
    pivoting.

    Raises:
        AnalyzerRunException: when every parser failed.
    """
    self.results = {"peepdf": {}, "pdfid": {}, "pdfreader": {}, "uris": []}

    peepdf_success = self.__peepdf_analysis()
    pdfid_success = self.__pdfid_analysis()
    pdfreader_success = self.__pdfreader_analysis()

    # the analysis fails only when ALL parsers fail
    # (the previous condition also raised when pdfreader plus just one
    # of the other two failed, contradicting the error message)
    if not (peepdf_success or pdfid_success or pdfreader_success):
        raise AnalyzerRunException("all readers failed")

    # pivot uris in the pdf only if we have exactly one page;
    # "pages" is absent when pdfreader could not parse the file, so use .get
    if self.results["pdfreader"].get("pages") == 1:
        uris = []
        # "stats" may be missing if peepdf failed
        for s in self.results["peepdf"].get("stats", []):
            uris.extend(s.get("uris", []))
        uris.extend(self.results["pdfreader"]["uris"])
        logger.info(f"{uris=}")
        uris = list(set(uris))  # removing duplicates
        if uris:
            self.results["uris"] = uris

    return self.results

def __pdfreader_analysis(self):
    """Extract page count and annotation uris with pdfreader.

    The pdf is first re-saved by pikepdf with object streams disabled so
    that pdfreader can see every object.

    Returns:
        bool: False only when pdfreader raised a parser error and no uri
        was extracted; True otherwise.
    """
    self.results["pdfreader"]["uris"] = []
    success = True
    parser_exception = False

    # pre-processing: write a temporary copy of the pdf with object
    # streams disabled (close the pikepdf handle when done — the original
    # leaked it; also the os.path.join with a single argument was a no-op)
    decompressed_path = self.filepath + "_stream_disabled.pdf"
    with pikepdf.Pdf.open(self.filepath) as pike_doc:
        pike_doc.save(
            decompressed_path,
            fix_metadata_version=True,
            object_stream_mode=pikepdf.ObjectStreamMode.disable,
        )

    try:
        with open(decompressed_path, "rb") as fd:
            try:
                doc = pdfreader.PDFDocument(fd)
            except pdfreader.exceptions.ParserException:
                parser_exception = True
            else:
                self.results["pdfreader"]["pages"] = doc.root["Pages"]["Count"]
                for page in doc.root["Pages"]["Kids"]:
                    # pages without annotations have no "Annots" key —
                    # plain indexing raised KeyError on such pdfs
                    for annot in page.get("Annots") or []:
                        if "A" in annot and "URI" in annot["A"]:
                            self.results["pdfreader"]["uris"].append(
                                annot["A"]["URI"].decode("utf8")
                            )
    finally:
        # always remove the temporary copy, even if parsing raised
        os.unlink(decompressed_path)

    # fail only when parsing raised AND nothing was extracted
    # (the original checked self.results["uris"], which is always empty
    # at this point — the pdfreader uris are the relevant ones)
    if parser_exception and len(self.results["pdfreader"]["uris"]) == 0:
        success = False

    return success

def __peepdf_analysis(self):
success = False
peepdf_analysis = {}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from django.db import migrations
from django.db.models.fields.related_descriptors import (
ForwardManyToOneDescriptor,
ForwardOneToOneDescriptor,
ManyToManyDescriptor,
)

# Serialized AnalyzerConfig for the new DownloadFileFromUri observable analyzer,
# installed by the data migration below.
plugin = {
    "python_module": {
        "health_check_schedule": None,
        "update_schedule": None,
        "module": "download_file_from_uri.DownloadFileFromUri",
        "base_path": "api_app.analyzers_manager.observable_analyzers",
    },
    "name": "DownloadFileFromUri",
    "description": "performs an http request to an uri and download the file through the http proxy",
    "disabled": False,
    "soft_time_limit": 60,
    "routing_key": "default",
    "health_check_status": True,
    "type": "observable",
    "docker_based": False,
    "maximum_tlp": "RED",
    "observable_supported": ["url"],
    "supported_filetypes": [],
    "run_hash": False,
    "run_hash_type": "",
    "not_supported_filetypes": [],
    # dotted path of the Django model this dict instantiates; popped by migrate()
    "model": "analyzers_manager.AnalyzerConfig",
}

# Parameter definitions for the analyzer: the proxy url (secret) and the
# destination folder for downloaded files.
params = [
    {
        "python_module": {
            "module": "download_file_from_uri.DownloadFileFromUri",
            "base_path": "api_app.analyzers_manager.observable_analyzers",
        },
        "name": "http_proxy",
        "type": "str",
        "description": "http proxy url",
        "is_secret": True,
        "required": True,
    },
    {
        "python_module": {
            "module": "download_file_from_uri.DownloadFileFromUri",
            "base_path": "api_app.analyzers_manager.observable_analyzers",
        },
        "name": "basefolder",
        "type": "str",
        "description": "folder where the files will be stored",
        "is_secret": False,
        "required": True,
    },
]

# Default PluginConfig values; only "basefolder" ships a default
# (the proxy is a secret each deployment must set itself).
values = [
    {
        "parameter": {
            "python_module": {
                "module": "download_file_from_uri.DownloadFileFromUri",
                "base_path": "api_app.analyzers_manager.observable_analyzers",
            },
            "name": "basefolder",
            "type": "str",
            "description": "folder where the files will be stored",
            "is_secret": False,
            "required": True,
        },
        "analyzer_config": "DownloadFileFromUri",
        "connector_config": None,
        "visualizer_config": None,
        "ingestor_config": None,
        "pivot_config": None,
        "for_organization": False,
        "value": "/opt/deploy/files_required/downloaded_files_from_uris/",
        "updated_at": "2024-06-19T10:23:03.145744Z",
        "owner": None,
    }
]


def _get_real_obj(Model, field, value):
    """Resolve a serialized field value into the real related object(s).

    Foreign-key / one-to-one fields get a single resolved object,
    many-to-many fields get a list of resolved objects; anything else is
    returned untouched.
    """

    def _get_obj(Model, other_model, value):
        # a dict is a nested serialized object: resolve each key recursively,
        # then get-or-create the related instance
        if isinstance(value, dict):
            resolved = {
                key: _get_real_obj(other_model, key, nested)
                for key, nested in value.items()
            }
            return other_model.objects.get_or_create(**resolved)[0]
        # otherwise it is just the primary key (or a natural name) serialized
        if isinstance(value, int):
            if Model.__name__ == "PluginConfig":
                return other_model.objects.get(name=plugin["name"])
            return other_model.objects.get(pk=value)
        return other_model.objects.get(name=value)

    descriptor = getattr(Model, field)
    if value and type(descriptor) in (
        ForwardManyToOneDescriptor,
        ForwardOneToOneDescriptor,
    ):
        value = _get_obj(Model, descriptor.get_queryset().model, value)
    elif value and type(descriptor) in (ManyToManyDescriptor,):
        value = [_get_obj(Model, descriptor.rel.model, item) for item in value]
    return value


def _create_object(Model, data):
    """Create a Model row from serialized *data* if missing.

    Returns True when an identical row already existed, False when a new
    one was created.
    """
    mtm_fields, plain_fields = {}, {}
    for field, raw in data.items():
        resolved = _get_real_obj(Model, field, raw)
        # many-to-many values can only be set after the row is saved
        if type(getattr(Model, field)) is ManyToManyDescriptor:
            mtm_fields[field] = resolved
        else:
            plain_fields[field] = resolved
    try:
        obj = Model.objects.get(**plain_fields)
    except Model.DoesNotExist:
        obj = Model(**plain_fields)
        obj.full_clean()
        obj.save()
        for field, resolved in mtm_fields.items():
            if resolved is not None:
                getattr(obj, field).set(resolved)
        return False
    return True


def migrate(apps, schema_editor):
    """Install the DownloadFileFromUri analyzer config, its parameters and
    default plugin-config values (idempotent)."""
    Parameter = apps.get_model("api_app", "Parameter")
    PluginConfig = apps.get_model("api_app", "PluginConfig")
    python_path = plugin.pop("model")
    Model = apps.get_model(*python_path.split("."))
    # skip entirely if a config with this name is already installed
    if Model.objects.filter(name=plugin["name"]).exists():
        return
    already_existed = _create_object(Model, plugin)
    if already_existed:
        return
    for param in params:
        _create_object(Parameter, param)
    for value in values:
        _create_object(PluginConfig, value)


def reverse_migrate(apps, schema_editor):
    """Delete the analyzer config created by ``migrate``.

    Uses a default for the pop because ``migrate`` mutates the shared
    ``plugin`` dict (it pops "model"); without the default, applying and
    then reversing in the same process would raise KeyError.
    """
    python_path = plugin.pop("model", "analyzers_manager.AnalyzerConfig")
    Model = apps.get_model(*python_path.split("."))
    Model.objects.get(name=plugin["name"]).delete()


class Migration(migrations.Migration):
    # not wrapped in a single transaction: the data migration performs
    # several dependent writes (config, parameters, plugin-config values)
    atomic = False
    dependencies = [
        ("api_app", "0062_alter_parameter_python_module"),
        ("analyzers_manager", "0096_analyzer_config_malprobscan"),
    ]

    # forward installs the analyzer, reverse deletes it
    operations = [migrations.RunPython(migrate, reverse_migrate)]
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# This file is a part of IntelOwl https://github.com/intelowlproject/IntelOwl
# See the file 'LICENSE' for copying permission.

import logging
import os
import re
import unicodedata
from urllib.parse import unquote, urlparse

import requests

from api_app.analyzers_manager.classes import ObservableAnalyzer

logger = logging.getLogger(__name__)


# https://github.com/django/django/blob/master/django/utils/text.py
def custom_slugify(value, allow_unicode=False):
    """Sanitize *value* into a string safe to use as a filename.

    Unicode is NFKC-normalized when *allow_unicode* is true, otherwise
    transliterated to plain ascii (dropping what cannot be encoded).
    Shell/path metacharacters are removed in either case.
    """
    text = str(value)
    if allow_unicode:
        text = unicodedata.normalize("NFKC", text)
    else:
        ascii_bytes = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
        text = ascii_bytes.decode("ascii")
    # clear strange chars
    return re.sub(r"[\\\"'$&%/#@()]", "", text)


class DownloadFileFromUri(ObservableAnalyzer):
    """Download the file served at the observed uri through an http proxy
    and store it on disk under ``basefolder``.

    Returns a dict with any request errors and the path the file was
    stored in (empty string when nothing was saved).
    """

    # destination directory for downloaded files (configured parameter)
    basefolder: str
    # secret: proxy url used for the outgoing http request
    _http_proxy: str

    def run(self):
        result = {"errors": [], "stored_in": ""}

        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() pair
        os.makedirs(self.basefolder, exist_ok=True)

        proxies = {"http": self._http_proxy} if self._http_proxy else {}
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/126.0.0.0 Safari/537.36 Edg/125.0.2535.92",
            "Content-type": "application/octet-stream",
        }

        try:
            r = requests.get(
                self.observable_name,
                headers=headers,
                proxies=proxies,
                allow_redirects=True,
                timeout=50,
            )
        except requests.exceptions.Timeout as e:
            result["errors"].append(f"timeout: {e}")
        except requests.exceptions.TooManyRedirects as e:
            # fixed message: this handler catches redirect loops,
            # not rate limiting ("too many requests")
            result["errors"].append(f"too many redirects: {e}")
        except requests.exceptions.HTTPError as e:
            result["errors"].append(f"http error: {e}")
        except requests.exceptions.ConnectionError as e:
            result["errors"].append(f"connection error: {e}")
        except requests.exceptions.RequestException as e:
            result["errors"].append(f"catastrophic error: {e}")
        else:
            # derive a safe filename from the uri path; skip saving when
            # the uri has no usable basename
            if filename := custom_slugify(
                os.path.basename(urlparse(unquote(self.observable_name)).path)
            ):
                filepath = os.path.join(self.basefolder, filename)
                # "with" handles flush/close; the explicit flush()/close()
                # calls inside the original with-block were redundant
                with open(filepath, "wb") as out:
                    out.write(r.content)
                result["stored_in"] = filepath
        return result
Loading
Loading