Examples

Example usages of ALiPy. You can also download them from the GitHub repository. The examples below cover instance querying, multi-label querying, feature querying, cost-sensitive querying, querying from noisy oracles, the novel query type AURO, and using the query strategies without the ToolBox.
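
All of the full examples share the same skeleton: build a ToolBox, split the data, repeatedly query with a strategy while updating the labeled/unlabeled index collections, record each iteration with a StateIO saver, and finally compare methods with the experiment analyser. The condensed sketch below illustrates that pattern with the uncertainty strategy only; it introduces nothing new and simply re-uses calls that appear in the full examples that follow.

            import copy
            from sklearn.datasets import make_classification
            from alipy import ToolBox

            # Build the toolbox on a small synthetic binary classification task
            X, y = make_classification(n_samples=500, n_features=10, n_classes=2)
            alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
            alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)
            model = alibox.get_default_model()

            unc_results = []
            for round in range(5):
                # One fold: get the data split, a results saver and a query strategy
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                saver = alibox.get_stateio(round)
                strategy = alibox.get_query_strategy(strategy_name="QueryInstanceUncertainty")

                for _ in range(50):  # query 50 times
                    # Query one instance and move it from the unlabeled to the labeled set
                    select_ind = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=1)
                    label_ind.update(select_ind)
                    unlab_ind.difference_update(select_ind)

                    # Retrain, evaluate and record the current state
                    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
                    pred = model.predict(X[test_idx, :])
                    accuracy = alibox.calc_performance_metric(y_true=y[test_idx], y_pred=pred,
                                                              performance_metric='accuracy_score')
                    saver.add_state(alibox.State(select_index=select_ind, performance=accuracy))
                unc_results.append(copy.deepcopy(saver))

            analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
            analyser.add_method(method_name='Unc', method_results=unc_results)
            analyser.plot_learning_curves()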


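            # Example: pool-based active learning with instance querying ('AllLabels') on synthetic binary
            # classification data, comparing Uncertainty, QBC, EER, Random, QUIRE, GraphDensity, LAL and,
            # if cvxpy is installed, BMDR and SPAL.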
            import copy

            from sklearn.datasets import make_classification
            
            from alipy import ToolBox
            
            X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=2,
                                       n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0,
                                       hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
            
            alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
            
            # Split data
            alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)
            
            # Use the default Logistic Regression classifier
            model = alibox.get_default_model()
            
            # The budget is 50 queries
            stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)
            
            
            def main_loop(alibox, strategy, round):
                # Get the data split of one fold experiment
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # Get intermediate results saver for one fold experiment
                saver = alibox.get_stateio(round)
            
                # Set initial performance point
                model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
                pred = model.predict(X[test_idx, :])
                accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                          y_pred=pred,
                                                          performance_metric='accuracy_score')
                saver.set_initial_point(accuracy)
            
                # If the stopping criterion is simple, such as querying 50 times, a plain `for i in range(50):` loop also works.
                while not stopping_criterion.is_stop():
                    # Select a subset of the unlabeled indexes according to the query strategy
                    # Pass model=None to use the default model, e.g. for evaluating the committee's
                    # disagreement in QBC
                    select_ind = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=1)
                    label_ind.update(select_ind)
                    unlab_ind.difference_update(select_ind)
            
                    # Update the model and calculate the performance according to the model you are using
                    model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
                    pred = model.predict(X[test_idx, :])
                    accuracy = alibox.calc_performance_metric(y_true=y[test_idx],
                                                              y_pred=pred,
                                                              performance_metric='accuracy_score')
            
                    # Save intermediate results to file
                    st = alibox.State(select_index=select_ind, performance=accuracy)
                    saver.add_state(st)
            
                    # Passing the current progress to stopping criterion object
                    stopping_criterion.update_information(saver)
                # Reset the progress in stopping criterion object
                stopping_criterion.reset()
                return saver
            
            
            unc_result = []
            qbc_result = []
            eer_result = []
            quire_result = []
            density_result = []
            bmdr_result = []
            spal_result = []
            lal_result = []
            rnd_result = []
            
            _I_have_installed_the_cvxpy = False
            
            for round in range(5):
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
            
                # Use pre-defined strategy
                unc = alibox.get_query_strategy(strategy_name="QueryInstanceUncertainty")
                qbc = alibox.get_query_strategy(strategy_name="QueryInstanceQBC")
                eer = alibox.get_query_strategy(strategy_name="QueryExpectedErrorReduction")
                rnd = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
                quire = alibox.get_query_strategy(strategy_name="QueryInstanceQUIRE", train_idx=train_idx)
                density = alibox.get_query_strategy(strategy_name="QueryInstanceGraphDensity", train_idx=train_idx)
                lal = alibox.get_query_strategy(strategy_name="QueryInstanceLAL", cls_est=10, train_slt=False)
                lal.download_data()
                lal.train_selector_from_file(reg_est=30, reg_depth=5)
            
                unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
                qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
                eer_result.append(copy.deepcopy(main_loop(alibox, eer, round)))
                rnd_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))
                quire_result.append(copy.deepcopy(main_loop(alibox, quire, round)))
                density_result.append(copy.deepcopy(main_loop(alibox, density, round)))
                lal_result.append(copy.deepcopy(main_loop(alibox, lal, round)))
            
                if _I_have_installed_the_cvxpy:
                    bmdr = alibox.get_query_strategy(strategy_name="QueryInstanceBMDR", kernel='rbf')
                    spal = alibox.get_query_strategy(strategy_name="QueryInstanceSPAL", kernel='rbf')
            
                    bmdr_result.append(copy.deepcopy(main_loop(alibox, bmdr, round)))
                    spal_result.append(copy.deepcopy(main_loop(alibox, spal, round)))
            
            analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
            analyser.add_method(method_name='QBC', method_results=qbc_result)
            analyser.add_method(method_name='Unc', method_results=unc_result)
            analyser.add_method(method_name='EER', method_results=eer_result)
            analyser.add_method(method_name='Random', method_results=rnd_result)
            analyser.add_method(method_name='QUIRE', method_results=quire_result)
            analyser.add_method(method_name='Density', method_results=density_result)
            analyser.add_method(method_name='LAL', method_results=lal_result)
            if _I_have_installed_the_cvxpy:
                analyser.add_method(method_name='BMDR', method_results=bmdr_result)
                analyser.add_method(method_name='SPAL', method_results=spal_result)
            print(analyser)
            analyser.plot_learning_curves(title='Example of alipy', std_area=False)
            

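            # Example: active learning with multi-label data, querying instance-label pairs on the one-hot
            # encoded iris dataset with a label ranking model as base learner, comparing AUDI, QUIRE, MMC,
            # Adaptive and Random.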
            import copy

            import numpy as np
            from sklearn.datasets import load_iris
            from sklearn.metrics import f1_score
            from sklearn.preprocessing import OneHotEncoder, normalize
            
            from alipy import ToolBox
            from alipy.index.multi_label_tools import get_Xy_in_multilabel
            from alipy.query_strategy.multi_label import *
            
            X, y = load_iris(return_X_y=True)
            X = normalize(X, norm='l2')
            mlb = OneHotEncoder()
            mult_y = mlb.fit_transform(y.reshape((-1, 1)))
            mult_y = np.asarray(mult_y.todense())
            mult_y_for_metric = mult_y.copy()
            
            
            # Or generate a dataset of any size
            # X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5)
            
            # Since we are using the label ranking model, label 0 means unknown; we need to
            # set the 0 entries to -1, which means irrelevant.
            mult_y[mult_y == 0] = -1
            
            alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
            alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
            model = LabelRankingModel() # base model
            
            def main_loop(alibox, round, strategy):
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # Get intermediate results saver for one fold experiment
                saver = alibox.get_stateio(round)
                # init model
                X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y, unknown_element=0)
                model.fit(X=X_tr, y=y_tr)
            
                ini_lab_num = len(label_ind)
                # A simple stopping criterion to specify the query budget.
                while len(label_ind) - ini_lab_num <= 120:
                    # query and update
                    if isinstance(strategy, QueryMultiLabelAUDI):
                        # If you are using a label ranking model, pass it to AUDI. It can
                        # avoid re-training a label ranking model inside the algorithm
                        select_labs = strategy.select(label_ind, unlab_ind, model=model)
                    else:
                        select_labs = strategy.select(label_ind, unlab_ind)
                    # use cost to record the number of queried instance-label pairs
                    # (querying a whole instance counts as one query for each of its labels)
                    if len(select_labs[0]) == 1:
                        cost = mult_y.shape[1]
                    else:
                        cost = len(select_labs)
                    label_ind.update(select_labs)
                    unlab_ind.difference_update(select_labs)
            
                    # train/test
                    X_tr, y_tr, _ = get_Xy_in_multilabel(select_labs, X=X, y=mult_y, unknown_element=0)
                    model.fit(X=X_tr, y=y_tr, is_incremental=True)
                    pres, pred = model.predict(X[test_idx])
                    pred[pred == -1] = 0
                    perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')
            
                    # save
                    st = alibox.State(select_index=select_labs, performance=perf, cost=cost)
                    saver.add_state(st)
                    saver.save()
            
                return copy.deepcopy(saver)
            
            
            audi_result = []
            quire_result = []
            random_result = []
            mmc_result = []
            adaptive_result = []
            
            for round in range(3):
                # init strategies
                audi = QueryMultiLabelAUDI(X, mult_y)
                quire = QueryMultiLabelQUIRE(X, mult_y, kernel='rbf')
                mmc = QueryMultiLabelMMC(X, mult_y)
                adaptive = QueryMultiLabelAdaptive(X, mult_y)
                random = QueryMultiLabelRandom(select_type='ins')
            
                audi_result.append(main_loop(alibox, round, strategy=audi))
                quire_result.append(main_loop(alibox, round, strategy=quire))
                mmc_result.append(main_loop(alibox, round, strategy=mmc))
                adaptive_result.append(main_loop(alibox, round, strategy=adaptive))
                random_result.append(main_loop(alibox, round, strategy=random))
            
            analyser = alibox.get_experiment_analyser(x_axis='cost')
            analyser.add_method(method_name='AUDI', method_results=audi_result)
            analyser.add_method(method_name='QUIRE', method_results=quire_result)
            analyser.add_method(method_name='RANDOM', method_results=random_result)
            analyser.add_method(method_name='MMC', method_results=mmc_result)
            analyser.add_method(method_name='Adaptive', method_results=adaptive_result)
            analyser.plot_learning_curves(plot_interval=3)  # plot a performance point in every 3 queries of instance-label pairs
            

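            # Example: active learning with feature querying, where missing feature values are queried and
            # the partially observed training matrix is completed with AFASMC_mc or IterativeSVD_mc,
            # comparing AFASMC, Stability and Random.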
            import copy
            from sklearn.datasets import make_classification
            from sklearn.linear_model import LogisticRegression
            from alipy.data_manipulate.al_split import split_features
            from alipy.query_strategy.query_features import QueryFeatureAFASMC, QueryFeatureRandom, QueryFeatureStability, \
                AFASMC_mc, IterativeSVD_mc
            from alipy.index import MultiLabelIndexCollection
            from alipy.experiment.stopping_criteria import StoppingCriteria
            from alipy.experiment import StateIO, State, ExperimentAnalyser
            from alipy.metrics import accuracy_score
            from alipy.index import map_whole_index_to_train
            
            # load and split data
            X, y = make_classification(n_samples=800, n_features=20, n_informative=2, n_redundant=2,
                                       n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01,
                                       hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
            tr, te, lab, unlab = split_features(feature_matrix=X, test_ratio=0.3, missing_rate=0.5,
                                                split_count=10, saving_path=None)
            
            # Use the default Logistic Regression classifier
            model = LogisticRegression(solver='liblinear')
            
            # The budget is 50 queries
            stopping_criterion = StoppingCriteria('num_of_queries', 50)
            
            AFASMC_result = []
            rand_result = []
            Stable_result = []
            
            # AFASMC
            for i in range(5):
                train_idx = tr[i]
                test_idx = te[i]
                label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
                unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
                saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
                strategy = QueryFeatureAFASMC(X=X, y=y, train_idx=train_idx)
            
                while not stopping_criterion.is_stop():
                    # query
                    selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)
            
                    # update index
                    label_ind.update(selected_feature)
                    unlab_ind.difference_update(selected_feature)
            
                    # train/test
                    lab_in_train = map_whole_index_to_train(train_idx, label_ind)
                    X_mc = AFASMC_mc(X=X[train_idx], y=y[train_idx], omega=lab_in_train)
                    model.fit(X_mc, y[train_idx])
                    pred = model.predict(X[test_idx])
                    perf = accuracy_score(y_true=y[test_idx], y_pred=pred)
            
                    # save
                    st = State(select_index=selected_feature, performance=perf)
                    saver.add_state(st)
                    # saver.save()
            
                    stopping_criterion.update_information(saver)
            
                stopping_criterion.reset()
                AFASMC_result.append(copy.deepcopy(saver))
            
            SVD_mc = IterativeSVD_mc(rank=4)
            # Stability
            for i in range(5):
                train_idx = tr[i]
                test_idx = te[i]
                label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
                unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
                saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
                strategy = QueryFeatureStability(X=X, y=y, train_idx=train_idx, rank_arr=[4, 6, 8])
            
                while not stopping_criterion.is_stop():
                    # query
                    selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)
            
                    # update index
                    label_ind.update(selected_feature)
                    unlab_ind.difference_update(selected_feature)
            
                    # train/test
                    lab_in_train = map_whole_index_to_train(train_idx, label_ind)
                    X_mc = SVD_mc.impute(X[train_idx], observed_mask=lab_in_train.get_matrix_mask(mat_shape=(len(train_idx), X.shape[1]), sparse=False))
                    model.fit(X_mc, y[train_idx])
                    pred = model.predict(X[test_idx])
                    perf = accuracy_score(y_true=y[test_idx], y_pred=pred)
            
                    # save
                    st = State(select_index=selected_feature, performance=perf)
                    saver.add_state(st)
            
                    stopping_criterion.update_information(saver)
            
                stopping_criterion.reset()
                Stable_result.append(copy.deepcopy(saver))
            
            # rand
            for i in range(5):
                train_idx = tr[i]
                test_idx = te[i]
                label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
                unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
                saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
                strategy = QueryFeatureRandom()
            
                while not stopping_criterion.is_stop():
                    # query
                    selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)
            
                    # update index
                    label_ind.update(selected_feature)
                    unlab_ind.difference_update(selected_feature)
            
                    # train/test
                    lab_in_train = map_whole_index_to_train(train_idx, label_ind)
                    X_mc = SVD_mc.impute(X[train_idx], observed_mask=lab_in_train.get_matrix_mask(mat_shape=(len(train_idx), X.shape[1]), sparse=False))
                    model.fit(X_mc, y[train_idx])
                    pred = model.predict(X[test_idx])
                    perf = accuracy_score(y_true=y[test_idx], y_pred=pred)
            
                    # save
                    st = State(select_index=selected_feature, performance=perf)
                    saver.add_state(st)
            
                    stopping_criterion.update_information(saver)
            
                stopping_criterion.reset()
                rand_result.append(copy.deepcopy(saver))
            
            analyser = ExperimentAnalyser()
            analyser.add_method(method_results=AFASMC_result, method_name='AFASMC')
            analyser.add_method(method_results=Stable_result, method_name='Stability')
            analyser.add_method(method_results=rand_result, method_name='Random')
            print(analyser)
            analyser.plot_learning_curves()
            

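            # Example: cost-sensitive active learning on multi-label data with a hierarchical label tree and
            # per-class query costs, training one SVC per label and comparing HALC, a performance-based
            # strategy and random selection.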
            import numpy as np 
            import copy
            
            from sklearn.datasets import make_multilabel_classification
            from sklearn.svm import SVC
            from sklearn.ensemble import RandomForestClassifier
            
            from alipy import ToolBox
            from alipy.index.multi_label_tools import get_Xy_in_multilabel
            from alipy.query_strategy.cost_sensitive import QueryCostSensitiveHALC, QueryCostSensitivePerformance, QueryCostSensitiveRandom
            from alipy.query_strategy.cost_sensitive import hierarchical_multilabel_mark
            
            # the num of classes of the classification problem
            NUM_CLASS = 5
            NUM_SAMPLES = 2000
            X, y = make_multilabel_classification(n_samples=NUM_SAMPLES, n_features=20, n_classes=NUM_CLASS,
                                               n_labels=3, length=50, allow_unlabeled=True,
                                               sparse=False, return_indicator='dense',
                                               return_distributions=False,
                                               random_state=None)
            y[y == 0] = -1
            # the cost of each class
            cost = [1, 3, 3, 7, 10]
            
            # if node i is the parent of node j, then label_tree[i, j] = 1, else 0
            label_tree = np.zeros((5, 5), dtype=int)
            label_tree[0, 1] = 1
            label_tree[0, 2] = 1
            label_tree[1, 3] = 1
            label_tree[2, 4] = 1
            
            alibox = ToolBox(X=X, y=y, query_type='PartLabels')
            
            # Split data
            alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10, all_class=True)
            
            # Train one model for each label of the dataset;
            # the base model is SVC from sklearn
            models = []
            for __ in range(NUM_CLASS):
                models.append(SVC(decision_function_shape='ovr', gamma='auto'))
            # The cost budget of each query
            budget = 40
            
            # The cost budget is 500
            stopping_criterion = alibox.get_stopping_criterion('cost_limit', 500)
            
            performance_result = []
            halc_result = []
            random_result = []
            
            def main_loop(alibox, strategy, round):
                # Get the data split of one fold experiment
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # Get intermediate results saver for one fold experiment
                saver = alibox.get_stateio(round)
            
                # Initialize the models
                train_target = label_ind.get_matrix_mask((NUM_SAMPLES, NUM_CLASS), sparse=False)
                for j in np.arange(NUM_CLASS):
                    j_target = train_target[:, j]
                    i_samples = np.where(j_target!=0)[0]
                    m = models[j]
                    m.fit(X[i_samples, :], y[i_samples, j])
            
                while not stopping_criterion.is_stop():
                    # Select a subset of the unlabeled indexes according to the query strategy
                    select_ind = strategy.select(label_ind, unlab_ind, cost=cost, budget=budget, models=models)
                    select_ind = hierarchical_multilabel_mark(select_ind, label_ind, label_tree, y)
            
                    label_ind.update(select_ind)
                    unlab_ind.difference_update(select_ind)
                        
                    # Update the models and calculate the performance according to the models you are using
                    train_target = label_ind.get_matrix_mask((NUM_SAMPLES, NUM_CLASS), sparse=False)
                    for j in np.arange(NUM_CLASS):
                        j_target = train_target[:, j]
                        i_samples = np.where(j_target!=0)[0]
                        m = models[j]
                        m.fit(X[i_samples, :], y[i_samples, j])
                    pred = None
                    for j in np.arange(NUM_CLASS):
                        model = models[j]
                        pred_j = model.predict(X[test_idx])
                        if pred is None:
                            pred = pred_j.reshape((len(test_idx), 1))
                        else:
                            pred = np.hstack((pred, pred_j.reshape((len(test_idx), 1))))
            
                    performance = alibox.calc_performance_metric(y_true=y[test_idx], y_pred=pred, performance_metric='hamming_loss')
            
                    # Save intermediate results to file
                    st = alibox.State(select_index=select_ind.index, performance=performance, cost=budget)
                    saver.add_state(st)
                    # Passing the current progress to stopping criterion object
                    stopping_criterion.update_information(saver)
                # Reset the progress in stopping criterion object
                stopping_criterion.reset()
                return saver
            
            for round in range(5):
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # Use pre-defined strategy
                random = QueryCostSensitiveRandom(X, y)
                perf = QueryCostSensitivePerformance(X, y)
                halc = QueryCostSensitiveHALC(X, y, label_tree=label_tree)
            
                random_result.append(copy.deepcopy(main_loop(alibox, random, round)))
                performance_result.append(copy.deepcopy(main_loop(alibox, perf, round)))
                halc_result.append(copy.deepcopy(main_loop(alibox, halc, round)))
            
            analyser = alibox.get_experiment_analyser(x_axis='cost')
            analyser.add_method(method_name='random', method_results=random_result)
            analyser.add_method(method_name='performance', method_results=performance_result)
            analyser.add_method(method_name='HALC', method_results=halc_result)
            
            print(analyser)
            analyser.plot_learning_curves(title='Example of cost-sensitive', std_area=False)
            

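            # Example: active learning with multiple noisy oracles of different noise levels and costs,
            # comparing CEAL, IEthresh, querying all oracles and random oracle selection; the answers of
            # the selected oracles are combined by majority vote.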
            from alipy.toolbox import ToolBox
            from alipy.oracle import Oracle, Oracles
            from alipy.utils.misc import randperm
            from alipy.query_strategy.noisy_oracles import QueryNoisyOraclesCEAL, QueryNoisyOraclesAll, \
                QueryNoisyOraclesIEthresh, QueryNoisyOraclesRandom, get_majority_vote
            from sklearn.datasets import make_classification
            import copy
            import numpy as np
            
            X, y = make_classification(n_samples=800, n_features=20, n_informative=2, n_redundant=2,
                                       n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01,
                                       hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
            
            alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
            
            # Split data
            alibox.split_AL(test_ratio=0.3, initial_label_rate=0.15, split_count=10)
            
            # Use the default Logistic Regression classifier
            model = alibox.get_default_model()
            
            # The cost budget is 30
            stopping_criterion = alibox.get_stopping_criterion('cost_limit', 30)
            
            # initialize noisy oracles with different noise levels
            n_samples = len(y)
            y1 = y.copy()
            y2 = y.copy()
            y3 = y.copy()
            y4 = y.copy()
            y5 = y.copy()
            perms = randperm(n_samples-1)
            y1[perms[0:round(n_samples*0.1)]] = 1-y1[perms[0:round(n_samples*0.1)]]
            perms = randperm(n_samples-1)
            y2[perms[0:round(n_samples*0.2)]] = 1-y2[perms[0:round(n_samples*0.2)]]
            perms = randperm(n_samples-1)
            y3[perms[0:round(n_samples*0.3)]] = 1-y3[perms[0:round(n_samples*0.3)]]
            perms = randperm(n_samples-1)
            y4[perms[0:round(n_samples*0.4)]] = 1-y4[perms[0:round(n_samples*0.4)]]
            perms = randperm(n_samples-1)
            y5[perms[0:round(n_samples*0.5)]] = 1-y5[perms[0:round(n_samples*0.5)]]
            oracle1 = Oracle(labels=y1, cost=np.zeros(y.shape)+1.2)
            oracle2 = Oracle(labels=y2, cost=np.zeros(y.shape)+.8)
            oracle3 = Oracle(labels=y3, cost=np.zeros(y.shape)+.5)
            oracle4 = Oracle(labels=y4, cost=np.zeros(y.shape)+.4)
            oracle5 = Oracle(labels=y5, cost=np.zeros(y.shape)+.3)
            oracle6 = Oracle(labels=[0]*n_samples, cost=np.zeros(y.shape)+.3)
            oracle7 = Oracle(labels=[1]*n_samples, cost=np.zeros(y.shape)+.3)
            oracles = Oracles()
            oracles.add_oracle(oracle_name='o1', oracle_object=oracle1)
            oracles.add_oracle(oracle_name='o2', oracle_object=oracle2)
            oracles.add_oracle(oracle_name='o3', oracle_object=oracle3)
            oracles.add_oracle(oracle_name='o4', oracle_object=oracle4)
            # oracles.add_oracle(oracle_name='o5', oracle_object=oracle5)
            oracles.add_oracle(oracle_name='oa0', oracle_object=oracle6)
            oracles.add_oracle(oracle_name='oa1', oracle_object=oracle7)
            
            # oracles_list = [oracle1, oracle2]
            
            # define the main AL loop
            def al_loop(strategy, alibox, round):
                # Get the data split of one fold experiment
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # Get intermediate results saver for one fold experiment
                saver = alibox.get_stateio(round)
                # Get repository to store noisy labels
                repo = alibox.get_repository(round)
            
                while not stopping_criterion.is_stop():
                    # Query
                    select_ind, select_ora = strategy.select(label_ind, unlab_ind)
                    vote_count, vote_result, cost = get_majority_vote(selected_instance=select_ind, oracles=oracles, names=select_ora)
                    repo.update_query(labels=vote_result, indexes=select_ind)
            
                    # update ind
                    label_ind.update(select_ind)
                    unlab_ind.difference_update(select_ind)
            
                    # Train/test
                    _, y_lab, indexes_lab = repo.get_training_data()
                    model.fit(X=X[indexes_lab], y=y_lab)
                    pred = model.predict(X[test_idx])
                    perf = alibox.calc_performance_metric(y_true=y[test_idx], y_pred=pred)
            
                    # save
                    st = alibox.State(select_index=select_ind, performance=perf, cost=cost)
                    saver.add_state(st)
            
                    stopping_criterion.update_information(saver)
            
                stopping_criterion.reset()
                return saver
            
            ceal_result = []
            iet_result = []
            all_result = []
            rand_result = []
            
            for round in range(5):
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # init strategies
                ceal = QueryNoisyOraclesCEAL(X, y, oracles=oracles, initial_labeled_indexes=label_ind)
                iet = QueryNoisyOraclesIEthresh(X=X, y=y, oracles=oracles, initial_labeled_indexes=label_ind)
                all = QueryNoisyOraclesAll(X=X, y=y, oracles=oracles)
                rand = QueryNoisyOraclesRandom(X=X, y=y, oracles=oracles)
            
                ceal_result.append(copy.deepcopy(al_loop(ceal, alibox, round)))
                iet_result.append(copy.deepcopy(al_loop(iet, alibox, round)))
                all_result.append(copy.deepcopy(al_loop(all, alibox, round)))
                rand_result.append(copy.deepcopy(al_loop(rand, alibox, round)))
            
            print(oracles.full_history())
            analyser = alibox.get_experiment_analyser(x_axis='cost')
            analyser.add_method(method_results=ceal_result, method_name='ceal')
            analyser.add_method(method_results=iet_result, method_name='iet')
            analyser.add_method(method_results=all_result, method_name='all')
            analyser.add_method(method_results=rand_result, method_name='rand')
            analyser.plot_learning_curves()
            

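            # Example: active learning with the novel query type AURO, which selects an instance and two of
            # its labels and asks the oracle which of the two labels is more relevant.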
            import copy
            import numpy as np
            from sklearn.datasets import load_iris, make_multilabel_classification
            from sklearn.preprocessing import OneHotEncoder, normalize
            from sklearn.metrics import f1_score
            from alipy.query_strategy.query_type import QueryTypeAURO
            from alipy.query_strategy.multi_label import LabelRankingModel
            from alipy.index.multi_label_tools import get_Xy_in_multilabel
            from alipy import ToolBox
            
            X, y = load_iris(return_X_y=True)
            # X, mult_y = make_multilabel_classification(n_samples=2500, n_labels=3, n_classes=10, n_features=15)
            mlb = OneHotEncoder()
            mult_y = mlb.fit_transform(y.reshape((-1, 1)))
            mult_y = np.asarray(mult_y.todense())
            X = normalize(X, norm='l2')
            mult_y_for_metric = mult_y.copy()
            mult_y[mult_y == 0] = -1
            
            alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
            alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
            model = LabelRankingModel() # base model
            
            # query type strategy
            AURO_results = []
            
            for round in range(5):
            
                train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
                # Get intermediate results saver for one fold experiment
                saver = alibox.get_stateio(round)
                query_y = mult_y.copy() # for labeling `less relevant`
                AURO_strategy = QueryTypeAURO(X=X, y=mult_y)
                # init model
                X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
                model.fit(X=X_tr, y=y_tr)
            
                for iter in range(100):
            
                    select_ins, select_y1, select_y2 = AURO_strategy.select(label_ind, unlab_ind, model=model, y_mat=query_y)
            
                    # simulate the oracle: compare the relevance of the two selected labels
                    y1 = mult_y[select_ins, select_y1]
                    y2 = mult_y[select_ins, select_y2]
                    if y1 < 0 and y2 < 0:
                        query_y[select_ins, select_y1] = -1
                        query_y[select_ins, select_y2] = -1
                    elif y1 > y2:
                        query_y[select_ins, select_y1] = 1
                        query_y[select_ins, select_y2] = 0.5
                    else:
                        query_y[select_ins, select_y1] = 0.5
                        query_y[select_ins, select_y2] = 1
            
                    # record results
                    label_ind.update([(select_ins, select_y1), (select_ins, select_y2)])
                    unlab_ind.difference_update([(select_ins, select_y1), (select_ins, select_y2)])
            
                    # train/test
                    X_tr, y_tr, _ = get_Xy_in_multilabel([(select_ins, select_y1), (select_ins, select_y2)], X=X, y=query_y, unknown_element=0)
                    model.fit(X=X_tr, y=y_tr, is_incremental=True)
                    pres, pred = model.predict(X[test_idx])
            
                    # using sklearn to calc micro-f1
                    pred[pred == -1] = 0
                    perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')
            
                    # save
                    st = alibox.State(select_index=[(select_ins, select_y1), (select_ins, select_y2)], performance=perf)
                    saver.add_state(st)
            
            
                AURO_results.append(copy.copy(saver))
            
            analyser = alibox.get_experiment_analyser()
            analyser.add_method(method_name='AURO', method_results=AURO_results)
            analyser.plot_learning_curves(plot_interval=5)
            

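                # Example: using a query strategy without the ToolBox, maintaining the index collections and
                # a MatrixRepository yourself and plugging in your own labeling procedure.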
                import numpy as np
                from sklearn.linear_model import LogisticRegression
                from alipy.query_strategy import QueryInstanceUncertainty
                from alipy.index import IndexCollection
                from alipy.oracle import MatrixRepository
                
                # Your labeled set
                X_lab = np.random.randn(100, 10)
                y_lab = np.random.randint(low=0, high=2, size=100)
                # The unlabeled pool; the labels of the unlabeled data can be anything,
                # since the algorithm will not use them.
                X_unlab = np.random.rand(100,10)
                y_place_holder = np.random.randint(low=0, high=2, size=100)
                
                # Initialize a query strategy.
                unc = QueryInstanceUncertainty(X=np.vstack((X_unlab, X_lab)), y=np.hstack((y_place_holder, y_lab)))
                unlab_ind = IndexCollection(np.arange(100))   # Indexes of your unlabeled set for querying
                label_ind = IndexCollection(np.arange(start=100, stop=200))  # Indexes of your labeled set
                labeled_repo = MatrixRepository(examples=X_lab, labels=y_lab, indexes=label_ind)   # Create a repository to store the labeled instances
                
                # Initialize your model
                model = LogisticRegression(solver='liblinear')
                model.fit(X_lab, y_lab)
                
                # A simple stopping criterion: query 50 times
                for i in range(50):
                    # Use a sklearn model to select instances.
                    select_ind = unc.select(label_index=label_ind, unlabel_index=unlab_ind, model=model, batch_size=1)
                    label_ind.update(select_ind)
                    unlab_ind.difference_update(select_ind)
                
                    # Label the selected instance here
                    selected_instance = X_unlab[select_ind]
                    # Replace this line with your own labeling code; do not always label the instance with 1.
                    lab_of_ins = 1
                
                    # Add the labeled example to the repo
                    labeled_repo.update_query(labels=lab_of_ins, indexes=select_ind, examples=selected_instance)
                
                    # If you are using your own model, update it here with the data stored in the
                    # repository, and pass it to unc.select() in the next iteration
                    X_tr, y_tr, ind = labeled_repo.get_training_data()
                    model.fit(X_tr, y_tr)
                
                    # Update the label matrix of the query strategy, in case the algorithm
                    # uses the labels of the labeled set
                    unc.y[select_ind] = lab_of_ins
                
                # Display the labeling history
                print(labeled_repo.full_history())
                
                import pickle
                with open('my_labeled_set.pkl', 'wb') as f:
                    pickle.dump(labeled_repo, f)
                

Copyright © 2018, alipy developers (BSD 3 License).