Skip to content

Commit

Permalink
fix: LEAP-237: Patch ORM Leak vulnerability in open source (#5012)
Browse files Browse the repository at this point in the history
* fix: LEAP-237: Patch ORM Leak vulnerability in open source

* add further detail to docstring about security concern

* fix bug where string starts with desc marker
  • Loading branch information
jombooth committed Nov 8, 2023
1 parent de252f5 commit f931d9d
Show file tree
Hide file tree
Showing 5 changed files with 351 additions and 10 deletions.
8 changes: 7 additions & 1 deletion label_studio/core/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import re
from datetime import timedelta

from label_studio.core.utils.params import get_bool_env
from label_studio.core.utils.params import get_bool_env, get_env_list

formatter = 'standard'
JSON_LOG = get_bool_env('JSON_LOG', False)
Expand Down Expand Up @@ -667,3 +667,9 @@ def collect_versions_dummy(**kwargs):
REAL_HOSTNAME = os.getenv('HOSTNAME') # we have to use getenv, because we don't use LABEL_STUDIO_ prefix
GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS = get_bool_env('GCS_CLOUD_STORAGE_FORCE_DEFAULT_CREDENTIALS', False)
PUBLIC_API_DOCS = get_bool_env('PUBLIC_API_DOCS', False)

# By default, we disallow filters with foreign keys in data manager for security reasons.
# Add to this list (either here in code, or via the env) to allow specific filters that rely on foreign keys.
DATA_MANAGER_FILTER_ALLOWLIST = list(
set(get_env_list('DATA_MANAGER_FILTER_ALLOWLIST') + ['updated_by__active_organization'])
)
18 changes: 15 additions & 3 deletions label_studio/core/utils/params.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from typing import Callable, Optional, Sequence, TypeVar

from rest_framework.exceptions import ValidationError

Expand Down Expand Up @@ -125,16 +126,27 @@ def get_bool_env(key, default):
return get_env(key, default, is_bool=True)


def get_env_list_int(key, default=None):
T = TypeVar('T')


def get_env_list(
key: str, default: Optional[Sequence[T]] = None, value_transform: Callable[[str], T] = str
) -> Sequence[T]:
"""
"1,2,3" in env variable => [1, 2, 3] in python
"foo,bar,baz" in env variable => ["foo", "bar", "baz"] in python.
Use value_transform to convert the strings to any other type.
"""
value = get_env(key)
if not value:
if default is None:
return []
return default
return [int(el) for el in value.split(',')]

return [value_transform(el) for el in value.split(',')]


def get_env_list_int(key, default=None) -> Sequence[int]:
return get_env_list(key, default=default, value_transform=int)


def get_all_env_with_prefix(prefix=None, is_bool=True, default_value=None):
Expand Down
37 changes: 32 additions & 5 deletions label_studio/data_manager/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
import logging
from collections import OrderedDict
from typing import Tuple
from urllib.parse import unquote

import ujson as json
Expand Down Expand Up @@ -337,11 +338,37 @@ def preprocess_filter(_filter, *_):
return _filter


def preprocess_field_name(raw_field_name, only_undefined_field=False):
field_name = raw_field_name.replace('filter:', '')
field_name = field_name.replace('tasks:', '')
ascending = False if field_name[0] == '-' else True # detect direction
field_name = field_name[1:] if field_name[0] == '-' else field_name # remove direction
def preprocess_field_name(raw_field_name, only_undefined_field=False) -> Tuple[str, bool]:
"""Transform a field name (as specified in the datamanager views endpoint) to
a django ORM field name. Also handle dotted accesses to task.data.
Edit with care; it's critical that this function not be changed in ways that
introduce vulnerabilities in the vein of the ORM Leak (see #5012). In particular
it is not advisable to use `replace` or other calls that replace all instances
of a string within this function.
Returns: Django ORM field name: str, Sort is ascending: bool
"""

field_name = raw_field_name
ascending = True

# Descending marker `-` may come at the beginning of the string
if field_name.startswith('-'):
ascending = False
field_name = field_name[1:]

# For security reasons, these must only be removed when they fall at the beginning of the string (or after `-`).
optional_prefixes = ['filter:', 'tasks:']
for prefix in optional_prefixes:
if field_name.startswith(prefix):
field_name = field_name[len(prefix) :]

# Descending marker may also come after other prefixes. Double negative is not allowed.
if ascending and field_name.startswith('-'):
ascending = False
field_name = field_name[1:]

if field_name.startswith('data.'):
if only_undefined_field:
field_name = f'data__{settings.DATA_UNDEFINED_NAME}'
Expand Down
44 changes: 44 additions & 0 deletions label_studio/data_manager/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import ujson as json
from data_manager.models import Filter, FilterGroup, View
from django.conf import settings
from django.db import transaction
from projects.models import Project
from rest_framework import serializers
Expand All @@ -18,6 +19,49 @@ class Meta:
model = Filter
fields = '__all__'

def validate_column(self, column: str) -> str:
"""
Ensure that the passed filter expression starts with 'filter:tasks:' and contains
no foreign key traversals. This means either the filter expression contains no '__'
substrings, or that it's the task.data json field that's accessed.
Users depending on foreign key traversals in views can allowlist them via the
DATA_MANAGER_FILTER_ALLOWLIST setting in the env.
Edit with care. The validations below are critical for security.
"""

column_copy = column

# We may support 'filter:annotations:' in the future, but we don't as of yet.
required_prefix = 'filter:tasks:'
optional_prefix = '-'

if not column_copy.startswith(required_prefix):
raise serializers.ValidationError(f'Filter "{column}" should start with "{required_prefix}"')

column_copy = column_copy[len(required_prefix) :]

if column_copy.startswith(optional_prefix):
column_copy = column_copy[len(optional_prefix) :]

if column_copy.startswith('data.'):
# Allow underscores if the filter is based on the `task.data` JSONField, because these don't leverage foreign keys.
return column

# Specific filters relying on foreign keys can be allowlisted
if column_copy in settings.DATA_MANAGER_FILTER_ALLOWLIST:
return column

# But in general, we don't allow foreign keys
if '__' in column_copy:
raise serializers.ValidationError(
f'"__" is not generally allowed in filters. Consider asking your administrator to add "{column_copy}" '
'to DATA_MANAGER_FILTER_ALLOWLIST, but note that some filter expressions may pose a security risk'
)

return column


class FilterGroupSerializer(serializers.ModelSerializer):
filters = FilterSerializer(many=True)
Expand Down
Loading

0 comments on commit f931d9d

Please # to comment.