Add protect method for feature order in fl-xgb #497

Merged (24 commits) on Jan 31, 2023
2 changes: 0 additions & 2 deletions federatedscope/autotune/baseline/fedhpo_vfl.yaml
@@ -14,7 +14,6 @@ model:
train:
  optimizer:
    lr: 0.5
-   bin_num: 100
    # learning rate for xgb model
    eta: 0.5
data:
@@ -36,7 +35,6 @@ vertical:
  key_size: 256
  dims: [7, 14]
  algo: 'xgb'
- xgb_use_bin: False
eval:
  freq: 5
  best_res_update_round_wise_key: test_loss
11 changes: 8 additions & 3 deletions federatedscope/core/auxiliaries/trainer_builder.py
@@ -28,7 +28,6 @@
    "mftrainer": "MFTrainer",
    "cltrainer": "CLTrainer",
    "lptrainer": "LPTrainer",
-   "verticaltrainer": "VerticalTrainer",
    "atc_trainer": "ATCTrainer",
}

@@ -135,8 +134,6 @@ def get_trainer(model=None,
        dict_path = "federatedscope.cv.trainer.trainer"
    elif config.trainer.type.lower() in ['nlptrainer']:
        dict_path = "federatedscope.nlp.trainer.trainer"
-   elif config.trainer.type.lower() in ['verticaltrainer']:
-       dict_path = "federatedscope.vertical_fl.trainer.trainer"
    elif config.trainer.type.lower() in ['cltrainer', 'lptrainer']:
        dict_path = "federatedscope.cl.trainer.trainer"
    elif config.trainer.type.lower() in [
@@ -171,6 +168,14 @@
            config=config,
            only_for_eval=only_for_eval,
            monitor=monitor)
+   elif config.trainer.type.lower() in ['verticaltrainer']:
+       from federatedscope.vertical_fl.trainer.utils import \
+           get_vertical_trainer
+       trainer = get_vertical_trainer(config=config,
+                                      model=model,
+                                      data=data,
+                                      device=device,
+                                      monitor=monitor)
    else:
        # try to find user registered trainer
        trainer = None
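The get_vertical_trainer helper lives in federatedscope/vertical_fl/trainer/utils.py, which this diff does not show. A plausible sketch of the dispatch, assuming it switches on cfg.vertical.protect_object (the actual helper may differ):

from federatedscope.vertical_fl.trainer import VerticalTrainer, \
    FeatureOrderProtectedTrainer


def get_vertical_trainer(config, model, data, device, monitor):
    # Hypothetical dispatch: use the protected trainer only when the
    # feature order is configured to be protected
    if config.vertical.protect_object == 'feature_order':
        return FeatureOrderProtectedTrainer(model, data, device, config,
                                            monitor)
    return VerticalTrainer(model, data, device, config, monitor)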
5 changes: 4 additions & 1 deletion federatedscope/core/configs/cfg_fl_setting.py
@@ -77,7 +77,10 @@ def extend_fl_setting_cfg(cfg):
    cfg.vertical.encryption = 'paillier'
    cfg.vertical.key_size = 3072
    cfg.vertical.algo = 'lr'  # ['lr', 'xgb']
-   cfg.vertical.xgb_use_bin = False
+   cfg.vertical.protect_object = ''  # feature_order, TODO: add more
+   cfg.vertical.protect_method = ''  # dp, op_boost
+   cfg.vertical.protect_args = []
+   # Default values for 'dp': {'bucket_num': 100, 'epsilon': None}

    # --------------- register corresponding check function ----------
    cfg.register_cfg_check_fun(assert_fl_setting_cfg)
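For reference, a minimal sketch of enabling the new options programmatically, assuming the yacs-style global_cfg entry point and illustrative values (epsilon=3 is arbitrary; the defaults above are bucket_num=100 and epsilon=None):

from federatedscope.core.configs.config import global_cfg

cfg = global_cfg.clone()
cfg.vertical.use = True  # assumed field enabling vertical FL
cfg.vertical.algo = 'xgb'
cfg.vertical.protect_object = 'feature_order'
cfg.vertical.protect_method = 'dp'
# A single dict of method-specific arguments, as read by the trainer
cfg.vertical.protect_args = [{'bucket_num': 100, 'epsilon': 3}]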
1 change: 1 addition & 0 deletions federatedscope/vertical_fl/dataset/blog.py
@@ -103,6 +103,7 @@ def _get_data(self):

    def _read_raw(self, file_path):
        data = pd.read_csv(file_path, header=None, usecols=list(range(281)))
+       data = data.fillna(method='ffill')
        data = data.values
        return data
1 change: 1 addition & 0 deletions federatedscope/vertical_fl/dataset/credit.py
@@ -97,6 +97,7 @@ def balance_sample(sample_size, y):

    def _read_raw(self, file_path):
        data = pd.read_csv(file_path)
+       data = data.fillna(method='ffill')
        data = data.values
        return data
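For context, fillna(method='ffill') forward-fills each NaN with the last valid value in its column (a leading NaN stays NaN, since nothing precedes it); a minimal standalone illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'y': [np.nan, 5.0, np.nan]})
filled = df.fillna(method='ffill')
# x becomes [1.0, 1.0, 3.0]; y becomes [nan, 5.0, 5.0]
print(filled.values)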
3 changes: 3 additions & 0 deletions federatedscope/vertical_fl/trainer/__init__.py
@@ -0,0 +1,3 @@
from federatedscope.vertical_fl.trainer.trainer import VerticalTrainer
from federatedscope.vertical_fl.trainer.feature_order_protected_trainer \
import FeatureOrderProtectedTrainer
235 changes: 235 additions & 0 deletions federatedscope/vertical_fl/trainer/feature_order_protected_trainer.py
@@ -0,0 +1,235 @@
import numpy as np
from federatedscope.vertical_fl.trainer.trainer import VerticalTrainer


class FeatureOrderProtectedTrainer(VerticalTrainer):
def __init__(self, model, data, device, config, monitor):
super(FeatureOrderProtectedTrainer,
self).__init__(model, data, device, config, monitor)

assert config.vertical.protect_method != '', \
"Please specify the adopted method for protecting feature order"
args = config.vertical.protect_args[0] if len(
config.vertical.protect_args) > 0 else {}

if config.vertical.protect_method == 'dp':
self.bucket_num = args.get('bucket_num', 100)
self.epsilon = args.get('epsilon', None)
self.protect_funcs = self._protect_via_dp
self.split_value = None
elif config.vertical.protect_method == 'op_boost':
self.algo = args.get('algo', 'global')
self.protect_funcs = self._protect_via_op_boost
self.lower_bound = args.get('lower_bound', 1)
self.upper_bound = args.get('upper_bound', 100)
if self.algo == 'global':
self.epsilon = args.get('epsilon', 2)
elif self.algo == 'adjusting':
self.epsilon_prt = args.get('epsilon_prt', 2)
self.epsilon_ner = args.get('epsilon_ner', 2)
self.partition_num = args.get('partition_num', 10)
            else:
                raise ValueError(f"The algo {self.algo} is not supported")
        else:
            raise ValueError(
                f"The method {config.vertical.protect_method} "
                f"is not supported")

def get_feature_value(self, feature_idx, value_idx):
if not hasattr(self, 'split_value') or self.split_value is None:
return super().get_feature_value(feature_idx=feature_idx,
value_idx=value_idx)

return self.split_value[feature_idx][value_idx]

def _bucketize(self, feature_order, bucket_size, bucket_num):
bucketized_feature_order = list()
for bucket_idx in range(bucket_num):
start = bucket_idx * bucket_size
end = min((bucket_idx + 1) * bucket_size, len(feature_order))
bucketized_feature_order.append(feature_order[start:end])
return bucketized_feature_order

    def _processed_data(self, data):
        # Rescale each feature to integers in [lower_bound, upper_bound]
        min_value = np.min(data, axis=0)
        max_value = np.max(data, axis=0)
        # Avoid division by zero when max_value[i] == min_value[i]
for i in range(data.shape[1]):
if max_value[i] == min_value[i]:
max_value[i] += 1
return np.round(self.lower_bound + (data - min_value) /
(max_value - min_value) *
(self.upper_bound - self.lower_bound))

    def _global_mapping_fun(self, x, epsilon, lower_bound, upper_bound):
        # Exponential mechanism: sample k in [lower_bound, upper_bound] with
        # probability proportional to exp(-|x - k| * epsilon / 2)
        probs = list()
denominator = np.sum(
np.exp(-np.abs(x - np.array(range(lower_bound, upper_bound + 1))) *
epsilon / 2))
for k in range(lower_bound, upper_bound + 1):
probs.append(np.exp(-np.abs(x - k) * epsilon / 2) / denominator)
res = np.random.choice(list(range(lower_bound, upper_bound + 1)),
p=probs)

return res

    def _adjusting_mapping_fun(self, x, partition_edges):
        for part_idx in range(self.partition_num):
            if partition_edges[part_idx] < x <= partition_edges[part_idx + 1]:
                # Perturb the partition index first, then perturb the value
                # within the selected partition
                selected_part = self._global_mapping_fun(
                    part_idx,
                    epsilon=self.epsilon_prt,
                    lower_bound=0,
                    upper_bound=self.partition_num - 1)
                res = self._global_mapping_fun(
                    x,
                    epsilon=self.epsilon_ner,
                    lower_bound=partition_edges[selected_part] + 1,
                    upper_bound=partition_edges[selected_part + 1])

                return res

def _op_boost_global(self, data):

processed_data = self._processed_data(data=data)
mapped_data = np.vectorize(self._global_mapping_fun)(
processed_data,
epsilon=self.epsilon,
lower_bound=self.lower_bound,
upper_bound=self.upper_bound)

return mapped_data

def _op_boost_adjusting(self, data):

processed_data = self._processed_data(data=data)
quantiles = np.linspace(0, 100, self.partition_num + 1)
partition_edges = np.round(
np.asarray(
np.percentile(
list(range(self.lower_bound - 1, self.upper_bound + 1)),
quantiles)))
partition_edges = [int(x) for x in partition_edges]
mapped_data = np.vectorize(self._adjusting_mapping_fun,
signature='(),(n)->()')(
processed_data,
partition_edges=partition_edges)

return mapped_data

    def _protect_via_op_boost(self, raw_feature_order, data):
        """
        Add random noise to the feature order for privacy protection.
        For more details, please refer to
        OpBoost: A Vertical Federated Tree Boosting Framework Based on
        Order-Preserving Desensitization
        (https://arxiv.org/pdf/2210.01318.pdf)
        """
if self.algo == 'global':
mapped_data = self._op_boost_global(data)
elif self.algo == 'adjusting':
mapped_data = self._op_boost_adjusting(data)
        else:
            raise ValueError(f"The algo {self.algo} is not supported")

# Get feature order based on mapped data
num_of_feature = mapped_data.shape[1]
protected_feature_order = [0] * num_of_feature
for i in range(num_of_feature):
protected_feature_order[i] = mapped_data[:, i].argsort()

return {
'raw_feature_order': raw_feature_order,
'feature_order': protected_feature_order,
}

def _protect_via_dp(self, raw_feature_order, data):
"""
Bucketize and add dp noise to feature order for privacy protection.
For more details, please refer to
FederBoost: Private Federated Learning for GBDT
(https://arxiv.org/pdf/2011.02796.pdf)
"""
protected_feature_order = list()
bucket_size = int(
np.ceil(self.cfg.dataloader.batch_size / self.bucket_num))
if self.epsilon is None:
prob_for_preserving = 1.0
else:
_tmp = np.power(np.e, self.epsilon)
prob_for_preserving = _tmp / (_tmp + self.bucket_num - 1)
prob_for_moving = (1.0 - prob_for_preserving) / (self.bucket_num - 1)
split_position = []
self.split_value = []

for feature_idx in range(len(raw_feature_order)):
bucketized_feature_order = self._bucketize(
raw_feature_order[feature_idx], bucket_size, self.bucket_num)
            noisy_bucketized_feature_order = \
                [[] for _ in range(self.bucket_num)]

# Add noise to bucketized feature order
for bucket_idx in range(self.bucket_num):
probs = np.ones(self.bucket_num) * prob_for_moving
probs[bucket_idx] = prob_for_preserving
                for each in bucketized_feature_order[bucket_idx]:
                    selected_bucket_idx = np.random.choice(
                        list(range(self.bucket_num)), p=probs)
                    noisy_bucketized_feature_order[
                        selected_bucket_idx].append(each)

# Save split positions (instance number within buckets)
# We exclude the endpoints to avoid empty sub-trees
_split_position = list()
_split_value = dict()
accumu_num = 0
            for bucket_idx, each_bucket in enumerate(
                    noisy_bucketized_feature_order):
instance_num = len(each_bucket)
# Skip the empty bucket
if instance_num != 0:
# Skip the endpoints
if bucket_idx != self.bucket_num - 1:
_split_position.append(accumu_num + instance_num)

# Save split values: average of min value of (j-1)-th
# bucket and max value of j-th bucket
max_value = data[bucketized_feature_order[bucket_idx]
[-1]][feature_idx]
min_value = data[bucketized_feature_order[bucket_idx]
[0]][feature_idx]
if accumu_num == 0:
_split_value[accumu_num +
instance_num] = min_value / 2.0
elif bucket_idx == self.bucket_num - 1:
_split_value[accumu_num] += max_value / 2.0
else:
_split_value[accumu_num] += max_value / 2.0
_split_value[accumu_num +
instance_num] = min_value / 2.0

accumu_num += instance_num

split_position.append(_split_position)
self.split_value.append(_split_value)

            for each_order in noisy_bucketized_feature_order:
                np.random.shuffle(each_order)
            noisy_bucketized_feature_order = np.concatenate(
                noisy_bucketized_feature_order)
            protected_feature_order.append(noisy_bucketized_feature_order)

extra_info = {'split_position': split_position}

return {
'raw_feature_order': raw_feature_order,
'feature_order': protected_feature_order,
'extra_info': extra_info
}

Review discussion on _get_feature_order_info:

Collaborator: For convenience, I also protected the label owner's feature order before. Actually, the label owner does not need to do this.

Author: Since it needs some more effort to fix this issue, such as modifying the split positions accordingly, we can add a TODO item here and fix it later.

    def _get_feature_order_info(self, data):
        # TODO: skip the protection for the label owner's feature order
        num_of_feature = data.shape[1]
        feature_order = [0] * num_of_feature
        for i in range(num_of_feature):
            feature_order[i] = data[:, i].argsort()
        return self.protect_funcs(feature_order, data)
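To make the two noise mechanisms above concrete, here is a small self-contained sketch; epsilon=2.0, bucket_num=10, the true bucket index 3, and x=42 are illustrative choices, not values from this PR:

import numpy as np

rng = np.random.default_rng(0)
epsilon = 2.0

# Bucket-level randomized response used in _protect_via_dp: an instance
# stays in its bucket with probability e^eps / (e^eps + bucket_num - 1)
bucket_num = 10
tmp = np.power(np.e, epsilon)
prob_for_preserving = tmp / (tmp + bucket_num - 1)
prob_for_moving = (1.0 - prob_for_preserving) / (bucket_num - 1)
probs = np.ones(bucket_num) * prob_for_moving
probs[3] = prob_for_preserving  # the true bucket index is 3
noisy_bucket = rng.choice(bucket_num, p=probs)
print(round(prob_for_preserving, 4), noisy_bucket)  # 0.4509, usually 3

# Exponential mechanism used in _global_mapping_fun for op_boost: k is
# sampled with probability proportional to exp(-|x - k| * epsilon / 2)
x, lower_bound, upper_bound = 42, 1, 100
ks = np.arange(lower_bound, upper_bound + 1)
weights = np.exp(-np.abs(x - ks) * epsilon / 2)
noisy_x = rng.choice(ks, p=weights / weights.sum())
print(noisy_x)  # concentrated around 42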