User-Defined Preprocessing Function
The user-defined preprocessing function should return two elements, features and targets, except for the pretrain
problem type.
Features and targets can be returned in one of the following formats:
- tuple of lists
- generator of tuples
Please note that if the preprocessing function returns a generator of tuples, the corresponding problem cannot be
chained using `&`.
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Toy single-modal example that returns a tuple of lists.

    Builds ten identical text inputs and ten 'a' targets. The input text
    used when `mode` equals `m3tl.TRAIN` differs from the text used for
    every other mode; the targets are the same in both cases.
    """
    sample_count = 10
    text = ('this is a toy input' if mode == m3tl.TRAIN
            else 'this is a toy input for test')
    features = [text] * sample_count
    targets = ['a'] * sample_count
    return features, targets
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Toy multi-modal example that returns a tuple of lists.

    Each of the ten features is a dict with a 'text' string and an 'image'
    array of 16 uniform random values; each target is 'a'. Only the text
    depends on whether `mode` equals `m3tl.TRAIN`.
    """
    sample_count = 10
    image_len = 16
    text = ('this is a toy input' if mode == m3tl.TRAIN
            else 'this is a toy input for test')
    features = []
    for _ in range(sample_count):
        # A fresh random array is drawn for every sample.
        features.append(
            {'text': text, 'image': np.random.uniform(size=image_len)})
    targets = ['a'] * sample_count
    return features, targets
# params.register_problem(problem_name='toy_cls', problem_type='cls', processing_fn=toy_cls)
# assert (10, 1)==toy_cls(params=params, mode=m3tl.TRAIN, get_data_num=True, write_tfrecord=False)
# shutil.rmtree(os.path.join(params.tmp_file_dir, 'toy_cls'))
# toy_cls(params=params, mode=m3tl.TRAIN, get_data_num=False, write_tfrecord=True)
# assert os.path.exists(os.path.join(params.tmp_file_dir, 'toy_cls', 'train_feature_desc.json'))
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Toy single-modal example written as a generator of tuples.

    Yields ten (input, target) pairs one at a time. The input text used
    when `mode` equals `m3tl.TRAIN` differs from the text used for every
    other mode; the target is always 'a'.
    """
    # NOTE(review): the return annotation says Tuple[list, list], but this
    # function is a generator that yields (input, target) tuples.
    sample_count = 10
    text = ('this is a toy input' if mode == m3tl.TRAIN
            else 'this is a toy input for test')
    features = [text] * sample_count
    targets = ['a'] * sample_count
    yield from zip(features, targets)
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Toy multi-modal example written as a generator of tuples.

    Yields ten (input, target) pairs. Each input is a dict with a 'text'
    string and an 'image' array of 16 uniform random values; each target
    is 'a'. Only the text depends on whether `mode` equals `m3tl.TRAIN`.
    """
    # NOTE(review): the return annotation says Tuple[list, list], but this
    # function is a generator that yields (input, target) tuples.
    sample_count = 10
    image_len = 16
    text = ('this is a toy input' if mode == m3tl.TRAIN
            else 'this is a toy input for test')
    # All samples are materialised before the first pair is yielded.
    features = []
    for _ in range(sample_count):
        features.append(
            {'text': text, 'image': np.random.uniform(size=image_len)})
    targets = ['a'] * sample_count
    yield from zip(features, targets)
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> RDD:
    """Toy multi-modal example that returns the result of `sc.parallelize`.

    First calls `get_or_make_label_encoder` for problem 'toy_cls' with the
    single label 'a'. It then builds ten records with the keys
    'inputs_text', 'inputs_image' (a list of 16 uniform random floats) and
    'labels', and hands them to `sc.parallelize`. Only the text depends on
    whether `mode` equals `m3tl.TRAIN`.
    """
    get_or_make_label_encoder(
        params=params, problem='toy_cls', label_list=['a'], mode=mode)

    sample_count = 10
    image_len = 16
    if mode == m3tl.TRAIN:
        text = 'this is a toy input'
    else:
        text = 'this is a toy input test'

    columns = {
        'inputs_text': [text] * sample_count,
        'inputs_image': [np.random.uniform(size=image_len).tolist()
                         for _ in range(sample_count)],
        'labels': ['a'] * sample_count,
    }
    # Pivot the column-oriented dict into a list of per-row dicts.
    records = pd.DataFrame(columns).to_dict('records')
    return sc.parallelize(records)
# Run the preproc_dec_test helper (defined elsewhere).
preproc_dec_test()