User-Defined Preprocessing Function
A user-defined preprocessing function should return two elements, features and targets, except for the pretrain problem type.
Features and targets can be returned in one of the following formats:
- a tuple of lists
- a generator of tuples
 
Please note that if the preprocessing function returns a generator of tuples, then the corresponding problem cannot be chained using &.
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Simple example of a single-modal preprocessing function returning a tuple of lists.

    Args:
        params: not used by this toy example.
        mode: compared against ``m3tl.TRAIN`` to pick the input text.

    Returns:
        ``(features, targets)``: ten identical text strings and ten ``'a'`` labels.
    """
    sample_count = 10
    # Only the text differs between the train branch and every other mode.
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    features = [text] * sample_count
    labels = ['a'] * sample_count
    return features, labels
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Simple example of a multi-modal preprocessing function returning a tuple of lists.

    Each feature is a dict with a ``'text'`` string and an ``'image'`` array of
    16 values drawn with ``np.random.uniform``.

    Args:
        params: not used by this toy example.
        mode: compared against ``m3tl.TRAIN`` to pick the input text.

    Returns:
        ``(features, targets)``: ten feature dicts and ten ``'a'`` labels.
    """
    sample_count = 10
    # Only the text differs between the train branch and every other mode.
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    features = [
        # A fresh random "image" vector is drawn for every sample.
        {'text': text, 'image': np.random.uniform(size=16)}
        for _ in range(sample_count)
    ]
    labels = ['a'] * sample_count
    return features, labels
# params.register_problem(problem_name='toy_cls', problem_type='cls', processing_fn=toy_cls)
# assert (10, 1)==toy_cls(params=params, mode=m3tl.TRAIN, get_data_num=True, write_tfrecord=False)
# shutil.rmtree(os.path.join(params.tmp_file_dir, 'toy_cls'))
# toy_cls(params=params, mode=m3tl.TRAIN, get_data_num=False, write_tfrecord=True)
# assert os.path.exists(os.path.join(params.tmp_file_dir, 'toy_cls', 'train_feature_desc.json'))
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Simple example of a single-modal preprocessing function written as a generator.

    Yields ``(feature, target)`` pairs one at a time instead of returning a
    tuple of lists.

    NOTE(review): the return annotation still reads ``Tuple[list, list]``
    although this is a generator function; it is left as-is here.

    Args:
        params: not used by this toy example.
        mode: compared against ``m3tl.TRAIN`` to pick the input text.

    Yields:
        Ten ``(text, 'a')`` tuples.
    """
    sample_count = 10
    # Only the text differs between the train branch and every other mode.
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    features = [text] * sample_count
    labels = ['a'] * sample_count
    for pair in zip(features, labels):
        yield pair
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> Tuple[list, list]:
    """Simple example of a multi-modal preprocessing function written as a generator.

    Yields ``(feature, target)`` pairs one at a time, where each feature is a
    dict with a ``'text'`` string and an ``'image'`` array of 16 values drawn
    with ``np.random.uniform``.

    NOTE(review): the return annotation still reads ``Tuple[list, list]``
    although this is a generator function; it is left as-is here.

    Args:
        params: not used by this toy example.
        mode: compared against ``m3tl.TRAIN`` to pick the input text.

    Yields:
        Ten ``(feature_dict, 'a')`` tuples.
    """
    sample_count = 10
    # Only the text differs between the train branch and every other mode.
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input for test'
    # All features are built before the first pair is yielded, so every random
    # draw happens up front rather than interleaved with the consumer.
    features = [
        {'text': text, 'image': np.random.uniform(size=16)}
        for _ in range(sample_count)
    ]
    labels = ['a'] * sample_count
    for pair in zip(features, labels):
        yield pair
@preprocessing_fn
def toy_cls(params: Params, mode: str) -> RDD:
    """Simple example of a multi-modal preprocessing function returning ``sc.parallelize`` output.

    Builds ten rows with the keys ``'inputs_text'``, ``'inputs_image'`` (a list
    of 16 random floats) and ``'labels'``, converts them to a list of per-row
    dicts through a pandas DataFrame, and hands that list to ``sc.parallelize``.

    Args:
        params: forwarded to ``get_or_make_label_encoder``.
        mode: forwarded to ``get_or_make_label_encoder`` and compared against
            ``m3tl.TRAIN`` to pick the input text.

    Returns:
        The result of ``sc.parallelize`` on the list of row dicts.
    """
    # Called up front with the single label 'a'; presumably creates or loads
    # the label encoder for 'toy_cls' -- confirm against its definition.
    get_or_make_label_encoder(params=params, problem='toy_cls', label_list=['a'], mode=mode)

    sample_count = 10
    # Only the text differs between the train branch and every other mode.
    text = 'this is a toy input' if mode == m3tl.TRAIN else 'this is a toy input test'
    columns = {
        'inputs_text': [text] * sample_count,
        # .tolist() turns each numpy array into a plain Python list of floats.
        'inputs_image': [np.random.uniform(size=16).tolist() for _ in range(sample_count)],
        'labels': ['a'] * sample_count,
    }
    # Column-oriented dict -> one dict per row.
    records = pd.DataFrame(columns).to_dict('records')
    # `sc` comes from module scope (presumably a SparkContext -- confirm).
    return sc.parallelize(records)
# Call preproc_dec_test, which is defined outside this section of the file
# (presumably a check of the preprocessing decorator -- confirm).
preproc_dec_test()