From f931d9d129002f54a495995774ce7384174cef5c Mon Sep 17 00:00:00 2001 From: Jo Booth Date: Wed, 8 Nov 2023 12:19:47 -0500 Subject: [PATCH] fix: LEAP-237: Patch ORM Leak vulnerability in open source (#5012) * fix: LEAP-237: Patch ORM Leak vulnerability in open source * add further detail to docstring about security concern * fix bug where string starts with desc marker --- label_studio/core/settings/base.py | 8 +- label_studio/core/utils/params.py | 18 +- label_studio/data_manager/functions.py | 37 ++- label_studio/data_manager/serializers.py | 44 +++ .../tests/data_manager/api_tasks.tavern.yml | 254 +++++++++++++++++- 5 files changed, 351 insertions(+), 10 deletions(-) diff --git a/label_studio/core/settings/base.py b/label_studio/core/settings/base.py index 26ab1861ae8d..57b8ac9a7227 100644 --- a/label_studio/core/settings/base.py +++ b/label_studio/core/settings/base.py @@ -15,7 +15,7 @@ import re from datetime import timedelta -from label_studio.core.utils.params import get_bool_env +from label_studio.core.utils.params import get_bool_env, get_env_list formatter = 'standard' JSON_LOG = get_bool_env('JSON_LOG', False) @@ -667,3 +667,9 @@ def collect_versions_dummy(**kwargs): REAL_HOSTNAME = os.getenv('HOSTNAME') # we have to use getenv, because we don't use LABEL_STUDIO_ prefix GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS = get_bool_env('GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS', False) PUBLIC_API_DOCS = get_bool_env('PUBLIC_API_DOCS', False) + +# By default, we disallow filters with foreign keys in data manager for security reasons. +# Add to this list (either here in code, or via the env) to allow specific filters that rely on foreign keys. +DATA_MANAGER_FILTER_ALLOWLIST = list( + set(get_env_list('DATA_MANAGER_FILTER_ALLOWLIST') + ['updated_by__active_organization']) +) diff --git a/label_studio/core/utils/params.py b/label_studio/core/utils/params.py index 1b083e308033..af6043538764 100644 --- a/label_studio/core/utils/params.py +++ b/label_studio/core/utils/params.py @@ -1,4 +1,5 @@ import os +from typing import Callable, Optional, Sequence, TypeVar from rest_framework.exceptions import ValidationError @@ -125,16 +126,27 @@ def get_bool_env(key, default): return get_env(key, default, is_bool=True) -def get_env_list_int(key, default=None): +T = TypeVar('T') + + +def get_env_list( + key: str, default: Optional[Sequence[T]] = None, value_transform: Callable[[str], T] = str +) -> Sequence[T]: """ - "1,2,3" in env variable => [1, 2, 3] in python + "foo,bar,baz" in env variable => ["foo", "bar", "baz"] in python. + Use value_transform to convert the strings to any other type. """ value = get_env(key) if not value: if default is None: return [] return default - return [int(el) for el in value.split(',')] + + return [value_transform(el) for el in value.split(',')] + + +def get_env_list_int(key, default=None) -> Sequence[int]: + return get_env_list(key, default=default, value_transform=int) def get_all_env_with_prefix(prefix=None, is_bool=True, default_value=None): diff --git a/label_studio/data_manager/functions.py b/label_studio/data_manager/functions.py index ba738b592398..c0ab3f18ca91 100644 --- a/label_studio/data_manager/functions.py +++ b/label_studio/data_manager/functions.py @@ -2,6 +2,7 @@ """ import logging from collections import OrderedDict +from typing import Tuple from urllib.parse import unquote import ujson as json @@ -337,11 +338,37 @@ def preprocess_filter(_filter, *_): return _filter -def preprocess_field_name(raw_field_name, only_undefined_field=False): - field_name = raw_field_name.replace('filter:', '') - field_name = field_name.replace('tasks:', '') - ascending = False if field_name[0] == '-' else True # detect direction - field_name = field_name[1:] if field_name[0] == '-' else field_name # remove direction +def preprocess_field_name(raw_field_name, only_undefined_field=False) -> Tuple[str, bool]: + """Transform a field name (as specified in the datamanager views endpoint) to + a django ORM field name. Also handle dotted accesses to task.data. + + Edit with care; it's critical that this function not be changed in ways that + introduce vulnerabilities in the vein of the ORM Leak (see #5012). In particular + it is not advisable to use `replace` or other calls that replace all instances + of a string within this function. + + Returns: Django ORM field name: str, Sort is ascending: bool + """ + + field_name = raw_field_name + ascending = True + + # Descending marker `-` may come at the beginning of the string + if field_name.startswith('-'): + ascending = False + field_name = field_name[1:] + + # For security reasons, these must only be removed when they fall at the beginning of the string (or after `-`). + optional_prefixes = ['filter:', 'tasks:'] + for prefix in optional_prefixes: + if field_name.startswith(prefix): + field_name = field_name[len(prefix) :] + + # Descending marker may also come after other prefixes. Double negative is not allowed. + if ascending and field_name.startswith('-'): + ascending = False + field_name = field_name[1:] + if field_name.startswith('data.'): if only_undefined_field: field_name = f'data__{settings.DATA_UNDEFINED_NAME}' diff --git a/label_studio/data_manager/serializers.py b/label_studio/data_manager/serializers.py index 444207764ed8..8451e3279a6c 100644 --- a/label_studio/data_manager/serializers.py +++ b/label_studio/data_manager/serializers.py @@ -4,6 +4,7 @@ import ujson as json from data_manager.models import Filter, FilterGroup, View +from django.conf import settings from django.db import transaction from projects.models import Project from rest_framework import serializers @@ -18,6 +19,49 @@ class Meta: model = Filter fields = '__all__' + def validate_column(self, column: str) -> str: + """ + Ensure that the passed filter expression starts with 'filter:tasks:' and contains + no foreign key traversals. This means either the filter expression contains no '__' + substrings, or that it's the task.data json field that's accessed. + + Users depending on foreign key traversals in views can allowlist them via the + DATA_MANAGER_FILTER_ALLOWLIST setting in the env. + + Edit with care. The validations below are critical for security. + """ + + column_copy = column + + # We may support 'filter:annotations:' in the future, but we don't as of yet. + required_prefix = 'filter:tasks:' + optional_prefix = '-' + + if not column_copy.startswith(required_prefix): + raise serializers.ValidationError(f'Filter "{column}" should start with "{required_prefix}"') + + column_copy = column_copy[len(required_prefix) :] + + if column_copy.startswith(optional_prefix): + column_copy = column_copy[len(optional_prefix) :] + + if column_copy.startswith('data.'): + # Allow underscores if the filter is based on the `task.data` JSONField, because these don't leverage foreign keys. + return column + + # Specific filters relying on foreign keys can be allowlisted + if column_copy in settings.DATA_MANAGER_FILTER_ALLOWLIST: + return column + + # But in general, we don't allow foreign keys + if '__' in column_copy: + raise serializers.ValidationError( + f'"__" is not generally allowed in filters. Consider asking your administrator to add "{column_copy}" ' + 'to DATA_MANAGER_FILTER_ALLOWLIST, but note that some filter expressions may pose a security risk' + ) + + return column + class FilterGroupSerializer(serializers.ModelSerializer): filters = FilterSerializer(many=True) diff --git a/label_studio/tests/data_manager/api_tasks.tavern.yml b/label_studio/tests/data_manager/api_tasks.tavern.yml index 430ccd1f1f5b..14026e830dbe 100644 --- a/label_studio/tests/data_manager/api_tasks.tavern.yml +++ b/label_studio/tests/data_manager/api_tasks.tavern.yml @@ -1124,7 +1124,7 @@ marks: - usefixtures: - django_live_url stages: - + - id: signup type: ref @@ -1479,3 +1479,255 @@ stages: "total_predictions": 0, "total": 1, } + +--- + +# See FilterSerializer; test logic preventing exploit that traverses the ORM +# to leak sensitive data character-by-character +test_name: tasks_api_filter_security_restrictions +strict: false +marks: +- usefixtures: + - django_live_url +stages: + +- id: signup + type: ref + +- id: create_project + name: create_project + request: + data: + title: Test Draft 1 + show_collab_predictions: true + method: POST + url: '{django_live_url}/api/projects' + response: + save: + json: + project_pk: id + created_by: created_by.id + status_code: 201 + +- name: create view with exploit filter should fail + request: + method: POST + url: '{django_live_url}/api/dm/views' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "filter:tasks:updated_by__sensitive_field" + operator: empty + value: "true" + type: String + response: + status_code: 400 + json: + validation_errors: + filter_group: + filters: + - column: + - '"__" is not generally allowed in filters. Consider asking your administrator to add "updated_by__sensitive_field" to DATA_MANAGER_FILTER_ALLOWLIST, but note that some filter expressions may pose a security risk' + +- name: create view with allowlisted filter should succeed + request: + method: POST + url: '{django_live_url}/api/dm/views' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "filter:tasks:updated_by__active_organization" + operator: empty + value: "true" + type: String + response: + status_code: 201 + save: + json: + view_pk: id + +- name: change filter to typical column succeeds + request: + method: PUT + url: '{django_live_url}/api/dm/views/{view_pk}?interaction=filter&project={project_pk}' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "filter:tasks:updated_by" + operator: empty + value: "true" + type: String + response: + status_code: 200 + +- name: change filter to unexpected prefix fails + request: + method: PUT + url: '{django_live_url}/api/dm/views/{view_pk}?interaction=filter&project={project_pk}' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "tasks:filter:annotations_results" + operator: empty + value: "true" + type: String + response: + status_code: 400 + json: + validation_errors: + filter_group: + filters: + - column: + - 'Filter "tasks:filter:annotations_results" should start with "filter:tasks:"' + +- name: change filter to include direction marker succeeds + request: + method: PUT + url: '{django_live_url}/api/dm/views/{view_pk}?interaction=filter&project={project_pk}' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "filter:tasks:-annotations_results" + operator: empty + value: "true" + type: String + response: + status_code: 200 + +- name: invalid filter with direction marker fails, but suggests correct allowlist + request: + method: PUT + url: '{django_live_url}/api/dm/views/{view_pk}?interaction=filter&project={project_pk}' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "filter:tasks:-updated_by__sensitive_field" + operator: empty + value: "true" + type: String + response: + status_code: 400 + json: + validation_errors: + filter_group: + filters: + - column: + - '"__" is not generally allowed in filters. Consider asking your administrator to add "updated_by__sensitive_field" to DATA_MANAGER_FILTER_ALLOWLIST, but note that some filter expressions may pose a security risk' + +- name: change filter to dotted data field with underscores succeeds + request: + method: PUT + url: '{django_live_url}/api/dm/views/{view_pk}?interaction=filter&project={project_pk}' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "filter:tasks:data.images__0" + operator: empty + value: "true" + type: String + response: + status_code: 200 + +- name: change filter to valid column without prefix fails + request: + method: PUT + url: '{django_live_url}/api/dm/views/{view_pk}?interaction=filter&project={project_pk}' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + - filter: "annotations_results" + operator: empty + value: "true" + type: String + response: + status_code: 400 + json: + validation_errors: + filter_group: + filters: + - column: + - 'Filter "annotations_results" should start with "filter:tasks:"' + +--- + +# Regression test preventing replace-all of prefixes from being used to construct exploit filters +test_name: tasks_api_prevent_prefix_regression +strict: false +marks: +- usefixtures: + - django_live_url +stages: + +- id: signup + type: ref + +- id: create_project + name: create_project + request: + data: + title: Test Draft 1 + show_collab_predictions: true + method: POST + url: '{django_live_url}/api/projects' + response: + save: + json: + project_pk: id + created_by: created_by.id + status_code: 201 + +# Perhaps our validation should be further improved, but for now, it's +# garbage in, garbage out. +- name: create view with prefix in middle of filter should succeed + request: + method: POST + url: '{django_live_url}/api/dm/views' + json: + project: '{project_pk}' + data: + filters: + conjunction: and + items: + # in an earlier version, preprocess_field_name would have + # transformed this to updated_by__sensitive_field + - filter: "filter:tasks:updated_by_tasks:_sensitive_field" + operator: empty + value: "true" + type: String + response: + status_code: 201 + save: + json: + view_pk: id + +- name: get_tasks fails because "updated_by_tasks:_sensitive_field" doesn't resolve to a field + request: + method: GET + url: '{django_live_url}/api/tasks?view={view_pk}' + response: + # 500 actually expected here + status_code: 500