Examples

Example usages of ALiPy. You can also download them from the GitHub repository.

import copy

from sklearn.datasets import make_classification

from alipy import ToolBox

# Build a synthetic binary-classification dataset (500 samples, 20 features).
X, y = make_classification(n_samples=500, n_features=20, n_informative=2, n_redundant=2,
                           n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0,
                           hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)

# ToolBox bundles the data, query type and result-saving path for the experiment.
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data: 30% test, 10% of the rest initially labeled, 10 independent folds.
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop each fold after 50 queries.
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)


def main_loop(alibox, strategy, round):
    """Run one fold of the active-learning experiment with `strategy`.

    Uses the module-level `model`, `X`, `y` and `stopping_criterion`;
    returns the fold's StateIO saver holding all intermediate results.
    """
    # Fixed train/test split and the initial labeled/unlabeled pools.
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    saver = alibox.get_stateio(round)

    def evaluate_current_model():
        # Fit the shared model on the current labeled pool and score it
        # on the held-out test set.
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        predictions = model.predict(X[test_idx, :])
        return alibox.calc_performance_metric(y_true=y[test_idx],
                                              y_pred=predictions,
                                              performance_metric='accuracy_score')

    # Record the performance before any query is made.
    saver.set_initial_point(evaluate_current_model())

    # If the stopping criterion is simple, such as query 50 times,
    # `for i in range(50):` works just as well.
    while not stopping_criterion.is_stop():
        # Ask the strategy for one instance from the unlabeled pool.
        chosen = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=1)
        label_ind.update(chosen)
        unlab_ind.difference_update(chosen)

        # Retrain, evaluate, and persist this round's result.
        saver.add_state(alibox.State(select_index=chosen,
                                     performance=evaluate_current_model()))

        # Let the stopping criterion see the current progress.
        stopping_criterion.update_information(saver)
    # Reset the criterion so the next fold starts fresh.
    stopping_criterion.reset()
    return saver


unc_result = []
qbc_result = []
eer_result = []
quire_result = []
density_result = []
bmdr_result = []
spal_result = []
lal_result = []
rnd_result = []

_I_have_installed_the_cvxpy = False

# Run 5 folds; every strategy starts each fold from the same split.
for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)

    # Use pre-defined strategy
    unc = alibox.get_query_strategy(strategy_name="QueryInstanceUncertainty")
    qbc = alibox.get_query_strategy(strategy_name="QueryInstanceQBC")
    # NOTE(review): the 'Qurey...' spelling appears intentional — it should
    # match the class name exported by alipy; confirm against the library.
    eer = alibox.get_query_strategy(strategy_name="QureyExpectedErrorReduction")
    rnd = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
    quire = alibox.get_query_strategy(strategy_name="QueryInstanceQUIRE", train_idx=train_idx)
    density = alibox.get_query_strategy(strategy_name="QueryInstanceGraphDensity", train_idx=train_idx)
    lal = alibox.get_query_strategy(strategy_name="QueryInstanceLAL", cls_est=10, train_slt=False)
    # LAL needs a pre-trained selector; this downloads its data (network access)
    # and fits the selection regressor from the downloaded file.
    lal.download_data()
    lal.train_selector_from_file(reg_est=30, reg_depth=5)

    # Deep-copy each saver so later folds cannot mutate stored results.
    unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
    qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
    eer_result.append(copy.deepcopy(main_loop(alibox, eer, round)))
    rnd_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))
    quire_result.append(copy.deepcopy(main_loop(alibox, quire, round)))
    density_result.append(copy.deepcopy(main_loop(alibox, density, round)))
    lal_result.append(copy.deepcopy(main_loop(alibox, lal, round)))

    # BMDR and SPAL require cvxpy, so they are optional.
    if _I_have_installed_the_cvxpy:
        bmdr = alibox.get_query_strategy(strategy_name="QueryInstanceBMDR", kernel='linear')
        spal = alibox.get_query_strategy(strategy_name="QueryInstanceSPAL", kernel='linear')

        bmdr_result.append(copy.deepcopy(main_loop(alibox, bmdr, round)))
        spal_result.append(copy.deepcopy(main_loop(alibox, spal, round)))

# Collect every strategy's results and plot the learning curves.
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
for method_name, method_results in (('QBC', qbc_result),
                                    ('Unc', unc_result),
                                    ('EER', eer_result),
                                    ('Random', rnd_result),
                                    ('QUIRE', quire_result),
                                    ('Density', density_result),
                                    ('LAL', lal_result)):
    analyser.add_method(method_name=method_name, method_results=method_results)
if _I_have_installed_the_cvxpy:
    analyser.add_method(method_name='BMDR', method_results=bmdr_result)
    analyser.add_method(method_name='SPAL', method_results=spal_result)
print(analyser)
analyser.plot_learning_curves(title='Example of alipy', std_area=False)
import copy

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, normalize

from alipy import ToolBox
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy.query_strategy.multi_label import *

# Load iris and one-hot encode the single label into a multi-label matrix.
X, y = load_iris(return_X_y=True)
X = normalize(X, norm='l2')
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())
# Keep an untouched {0,1} copy for sklearn metrics before remapping below.
mult_y_for_metric = mult_y.copy()

# Or generate a dataset with any sizes
# X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5)

# Since we are using the label ranking model, the label 0 means unknown. We need
# to set the 0 entries to -1, which means irrelevant.
mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
model = LabelRankingModel() # base model

def main_loop(alibox, round, strategy):
    """Run one fold of the multi-label experiment with `strategy`.

    Uses the module-level `model`, `X`, `mult_y` and `mult_y_for_metric`;
    returns a deep copy of the fold's StateIO saver.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # init model on the initially labeled instance-label pairs
    X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y, unknown_element=0)
    model.fit(X=X_tr, y=y_tr)

    ini_lab_num = len(label_ind)
    # A simple stopping criterion to specify the query budget.
    while len(label_ind) - ini_lab_num <= 120:
        # query and update
        if isinstance(strategy, QueryMultiLabelAUDI):
            # If you are using a label ranking model, pass it to AUDI. It can
            # avoid re-training a label ranking model inside the algorithm
            select_labs = strategy.select(label_ind, unlab_ind, model=model)
        else:
            select_labs = strategy.select(label_ind, unlab_ind)
        # use cost to record the amount of queried instance-label pairs:
        # a 1-tuple index means a whole instance (all of its labels) was
        # queried, so charge one unit per class; otherwise charge per pair.
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)

        # train/test
        # NOTE(review): the incremental fit uses only the newly queried pairs,
        # not the whole labeled pool — confirm this matches LabelRankingModel's
        # is_incremental contract.
        X_tr, y_tr, _ = get_Xy_in_multilabel(select_labs, X=X, y=mult_y, unknown_element=0)
        model.fit(X=X_tr, y=y_tr, is_incremental=True)
        pres, pred = model.predict(X[test_idx])
        # using sklearn to calc micro-f1 (map -1 back to 0 first)
        pred[pred == -1] = 0
        perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')

        # save intermediate results to file after every query
        st = alibox.State(select_index=select_labs, performance=perf, cost=cost)
        saver.add_state(st)
        saver.save()

    return copy.deepcopy(saver)


# One result list per multi-label query strategy.
audi_result, quire_result = [], []
random_result, mmc_result, adaptive_result = [], [], []

# Three folds; a fresh strategy instance is built for every fold.
for round in range(3):
    audi = QueryMultiLabelAUDI(X, mult_y)
    quire = QueryMultiLabelQUIRE(X, mult_y, kernel='rbf')
    mmc = QueryMultiLabelMMC(X, mult_y)
    adaptive = QueryMultiLabelAdaptive(X, mult_y)
    rand_query = QueryMultiLabelRandom(select_type='ins')

    # Run each strategy on this fold and store its saver.
    for query_strategy, results in ((audi, audi_result),
                                    (quire, quire_result),
                                    (mmc, mmc_result),
                                    (adaptive, adaptive_result),
                                    (rand_query, random_result)):
        results.append(main_loop(alibox, round, strategy=query_strategy))

# Compare the strategies on a cost (instance-label pairs) x-axis.
analyser = alibox.get_experiment_analyser(x_axis='cost')
for method_name, method_results in (('AUDI', audi_result),
                                    ('QUIRE', quire_result),
                                    ('RANDOM', random_result),
                                    ('MMC', mmc_result),
                                    ('Adaptive', adaptive_result)):
    analyser.add_method(method_name=method_name, method_results=method_results)
# plot a performance point in every 3 queries of instance-label pairs
analyser.plot_learning_curves(plot_interval=3)
import copy
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from alipy.data_manipulate.al_split import split_features
from alipy.query_strategy.query_features import QueryFeatureAFASMC, QueryFeatureRandom, QueryFeatureStability, \
    AFASMC_mc, IterativeSVD_mc
from alipy.index import MultiLabelIndexCollection
from alipy.experiment.stopping_criteria import StoppingCriteria
from alipy.experiment import StateIO, State, ExperimentAnalyser
from alipy.metrics import accuracy_score
from alipy.index import map_whole_index_to_train

# load and split data
X, y = make_classification(n_samples=800, n_features=20, n_informative=2, n_redundant=2,
                           n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01,
                           hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
# Active feature acquisition: half of the feature entries start out missing.
tr, te, lab, unlab = split_features(feature_matrix=X, test_ratio=0.3, missing_rate=0.5,
                                    split_count=10, saving_path=None)

# Use the default Logistic Regression classifier
model = LogisticRegression()

# Stop each fold after 50 queries.
stopping_criterion = StoppingCriteria('num_of_queries', 50)

# One result list per feature-querying strategy.
AFASMC_result = []
rand_result =[]
Stable_result = []

# AFASMC: query missing feature values, complete the training matrix with
# AFASMC_mc, then train and evaluate the classifier.
for i in range(5):
    train_idx = tr[i]
    test_idx = te[i]
    # (instance, feature) pairs observed so far / still missing.
    label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
    unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
    saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
    strategy = QueryFeatureAFASMC(X=X, y=y, train_idx=train_idx)

    while not stopping_criterion.is_stop():
        # query (NOTE(review): `unkonwn_entries` spelling should match the
        # keyword argument exported by alipy — confirm against the library)
        selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)

        # update index
        label_ind.update(selected_feature)
        unlab_ind.difference_update(selected_feature)

        # train/test: map whole-matrix indices onto the training rows, then
        # complete the partially observed training matrix before fitting.
        lab_in_train = map_whole_index_to_train(train_idx, label_ind)
        X_mc = AFASMC_mc(X=X[train_idx], y=y[train_idx], omega=lab_in_train)
        model.fit(X_mc, y[train_idx])
        pred = model.predict(X[test_idx])
        perf = accuracy_score(y_true=y[test_idx], y_pred=pred)

        # save
        st = State(select_index=selected_feature, performance=perf)
        saver.add_state(st)
        # saver.save()

        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    AFASMC_result.append(copy.deepcopy(saver))

# Rank-4 iterative-SVD matrix completion, shared with the random baseline below.
SVD_mc = IterativeSVD_mc(rank=4)
# Stability
for i in range(5):
    train_idx = tr[i]
    test_idx = te[i]
    # (instance, feature) pairs observed so far / still missing.
    label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
    unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
    saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
    strategy = QueryFeatureStability(X=X, y=y, train_idx=train_idx, rank_arr=[4, 6, 8])

    while not stopping_criterion.is_stop():
        # query (keyword spelling follows alipy's API)
        selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)

        # update index
        label_ind.update(selected_feature)
        unlab_ind.difference_update(selected_feature)

        # train/test: impute the missing entries of the training matrix using
        # the mask of currently observed (instance, feature) positions.
        lab_in_train = map_whole_index_to_train(train_idx, label_ind)
        X_mc = SVD_mc.impute(X[train_idx], observed_mask=lab_in_train.get_matrix_mask(mat_shape=(len(train_idx), X.shape[1]), sparse=False))
        model.fit(X_mc, y[train_idx])
        pred = model.predict(X[test_idx])
        perf = accuracy_score(y_true=y[test_idx], y_pred=pred)

        # save
        st = State(select_index=selected_feature, performance=perf)
        saver.add_state(st)

        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    Stable_result.append(copy.deepcopy(saver))

# Random feature-querying baseline (same train/test pipeline as above).
for i in range(5):
    train_idx = tr[i]
    test_idx = te[i]
    # (instance, feature) pairs observed so far / still missing.
    label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
    unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
    saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
    strategy = QueryFeatureRandom()

    while not stopping_criterion.is_stop():
        # query (keyword spelling follows alipy's API)
        selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)

        # update index
        label_ind.update(selected_feature)
        unlab_ind.difference_update(selected_feature)

        # train/test: impute missing entries with the shared SVD completer.
        lab_in_train = map_whole_index_to_train(train_idx, label_ind)
        X_mc = SVD_mc.impute(X[train_idx], observed_mask=lab_in_train.get_matrix_mask(mat_shape=(len(train_idx), X.shape[1]), sparse=False))
        model.fit(X_mc, y[train_idx])
        pred = model.predict(X[test_idx])
        perf = accuracy_score(y_true=y[test_idx], y_pred=pred)

        # save
        st = State(select_index=selected_feature, performance=perf)
        saver.add_state(st)

        stopping_criterion.update_information(saver)

    stopping_criterion.reset()
    rand_result.append(copy.deepcopy(saver))

# Aggregate the three strategies' results and plot the learning curves.
analyser = ExperimentAnalyser()
for method_results, method_name in ((AFASMC_result, 'AFASMC'),
                                    (Stable_result, 'Stability'),
                                    (rand_result, 'Random')):
    analyser.add_method(method_results=method_results, method_name=method_name)
print(analyser)
analyser.plot_learning_curves()
import numpy as np 
import copy

from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier

from alipy import ToolBox
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy.query_strategy.cost_sensitive import QueryCostSensitiveHALC, QueryCostSensitivePerformance, QueryCostSensitiveRandom
from alipy.query_strategy.cost_sensitive import hierarchical_multilabel_mark

# Dense 5-class multi-label dataset; remap 0 -> -1 (irrelevant label).
X, y = make_multilabel_classification(n_samples=2000, n_features=20, n_classes=5,
                                   n_labels=3, length=50, allow_unlabeled=True,
                                   sparse=False, return_indicator='dense',
                                   return_distributions=False,
                                   random_state=None)
y[y == 0] = -1

# the cost of querying each of the 5 classes
cost = [1, 3, 3, 7, 10]

# Hierarchy over the 5 labels: label_tree[i, j] == 1 iff node i is the parent
# of node j, else 0.
# Bug fix: `np.int` was a deprecated alias of the builtin int and was removed
# in NumPy 1.24, so `dtype=np.int` crashes on modern NumPy. Use `int` instead
# (identical dtype, works on all NumPy versions).
label_tree = np.zeros((5, 5), dtype=int)
label_tree[0, 1] = 1
label_tree[0, 2] = 1
label_tree[1, 3] = 1
label_tree[2, 4] = 1

alibox = ToolBox(X=X, y=y, query_type='PartLabels')

# Split data
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10, all_class=True)

# Base classifier: random forest.
model = RandomForestClassifier()

# Per-query budget handed to strategy.select below.
budget = 40

# Stop each fold once the accumulated cost reaches 500.
stopping_criterion = alibox.get_stopping_criterion('cost_limit', 500)

# One result list per cost-sensitive strategy.
performance_result = []
halc_result = []
random_result = []

def main_loop(alibox, strategy, round):
    """Run one fold of the cost-sensitive experiment with `strategy`.

    Uses the module-level `model`, `cost`, `budget`, `label_tree` and
    `stopping_criterion`; returns this fold's StateIO saver.
    """
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        select_ind = strategy.select(label_ind, unlab_ind, cost=cost, budget=budget)
        # Adjust the selection to respect the label hierarchy in label_tree.
        select_ind = hierarchical_multilabel_mark(select_ind, label_ind, label_tree, y)

        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)
            
        # Update model and calc performance according to the model you are using
        X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=y)
        model.fit(X_tr, y_tr)
        pred = model.predict(X[test_idx, :])
        # Labels live in {-1, 1}; map any 0 predictions to 1.
        # NOTE(review): mapping 0 -> 1 rather than -1 looks arbitrary — confirm.
        pred[pred == 0] = 1

        performance = alibox.calc_performance_metric(y_true=y[test_idx], y_pred=pred, performance_metric='hamming_loss')

        # Save intermediate results to file.
        # NOTE(review): the full per-query `budget` is recorded as this round's
        # cost rather than the actual cost of the queried labels — confirm.
        st = alibox.State(select_index=select_ind.index, performance=performance, cost=budget)
        saver.add_state(st)
        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in stopping criterion object
    stopping_criterion.reset()
    return saver

# Five folds; build fresh strategy instances for each fold.
for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    rand_strategy = QueryCostSensitiveRandom(X, y)
    perf_strategy = QueryCostSensitivePerformance(X, y)
    halc_strategy = QueryCostSensitiveHALC(X, y, label_tree=label_tree)

    # Deep-copy each saver so later folds cannot mutate stored results.
    random_result.append(copy.deepcopy(main_loop(alibox, rand_strategy, round)))
    performance_result.append(copy.deepcopy(main_loop(alibox, perf_strategy, round)))
    halc_result.append(copy.deepcopy(main_loop(alibox, halc_strategy, round)))

# Compare the strategies on a cost x-axis and plot the curves.
analyser = alibox.get_experiment_analyser(x_axis='cost')
for method_name, method_results in (('random', random_result),
                                    ('performance', performance_result),
                                    ('HALC', halc_result)):
    analyser.add_method(method_name=method_name, method_results=method_results)

print(analyser)
analyser.plot_learning_curves(title='Example of cost-sensitive', std_area=False)
from alipy.toolbox import ToolBox
from alipy.oracle import Oracle, Oracles
from alipy.utils.misc import randperm
from alipy.query_strategy.noisy_oracles import QueryNoisyOraclesCEAL, QueryNoisyOraclesAll, \
    QueryNoisyOraclesIEthresh, QueryNoisyOraclesRandom, get_majority_vote
from sklearn.datasets import make_classification
import copy
import numpy as np

# Synthetic binary dataset for the noisy-oracle experiment.
X, y = make_classification(n_samples=800, n_features=20, n_informative=2, n_redundant=2,
                           n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01,
                           hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)

alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')

# Split data
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.15, split_count=10)

# Use the default Logistic Regression classifier
model = alibox.get_default_model()

# Stop once the accumulated annotation cost reaches 30 (not a query count).
stopping_criterion = alibox.get_stopping_criterion('cost_limit', 30)

# initialize noisy oracles with different noise level
n_samples = len(y)
# Build five noisy copies of the ground-truth labels by flipping a growing
# fraction (10%..50%) of randomly chosen entries.
noisy_label_sets = []
for rate in (0.1, 0.2, 0.3, 0.4, 0.5):
    flipped = y.copy()
    perms = randperm(n_samples-1)
    flip_idx = perms[0:round(n_samples*rate)]
    flipped[flip_idx] = 1 - flipped[flip_idx]
    noisy_label_sets.append(flipped)
y1, y2, y3, y4, y5 = noisy_label_sets

# Cheaper oracles are noisier; cost is a constant per-instance fee.
oracle1 = Oracle(labels=y1, cost=np.zeros(y.shape)+1.2)
oracle2 = Oracle(labels=y2, cost=np.zeros(y.shape)+.8)
oracle3 = Oracle(labels=y3, cost=np.zeros(y.shape)+.5)
oracle4 = Oracle(labels=y4, cost=np.zeros(y.shape)+.4)
oracle5 = Oracle(labels=y5, cost=np.zeros(y.shape)+.3)
# Two degenerate oracles that always answer 0 / always answer 1.
oracle6 = Oracle(labels=[0]*n_samples, cost=np.zeros(y.shape)+.3)
oracle7 = Oracle(labels=[1]*n_samples, cost=np.zeros(y.shape)+.3)

oracles = Oracles()
for oracle_name, oracle_object in (('o1', oracle1), ('o2', oracle2),
                                   ('o3', oracle3), ('o4', oracle4),
                                   ('oa0', oracle6), ('oa1', oracle7)):
    oracles.add_oracle(oracle_name=oracle_name, oracle_object=oracle_object)
# oracle5 ('o5', the 50%-noise oracle) is deliberately left unregistered.

# oracles_list = [oracle1, oracle2]

# Main active-learning loop for one fold.
def al_loop(strategy, alibox, round):
    """Run one fold with a noisy-oracle `strategy`.

    Uses the module-level `model`, `X`, `y`, `oracles` and
    `stopping_criterion`; returns the fold's StateIO saver.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    saver = alibox.get_stateio(round)
    # Repository storing the (possibly noisy) labels collected so far.
    repo = alibox.get_repository(round)

    while not stopping_criterion.is_stop():
        # Ask the strategy which instance to query and which oracle(s) to ask,
        # then take the majority vote over the chosen oracles' answers.
        select_ind, select_ora = strategy.select(label_ind, unlab_ind)
        vote_count, vote_result, cost = get_majority_vote(
            selected_instance=select_ind, oracles=oracles, names=select_ora)
        repo.update_query(labels=vote_result, indexes=select_ind)

        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)

        # Retrain on everything labeled so far and score on the test set.
        _, y_lab, indexes_lab = repo.get_training_data()
        model.fit(X=X[indexes_lab], y=y_lab)
        predictions = model.predict(X[test_idx])
        perf = alibox.calc_performance_metric(y_true=y[test_idx], y_pred=predictions)

        # Persist this round's result (including its annotation cost).
        saver.add_state(alibox.State(select_index=select_ind,
                                     performance=perf, cost=cost))

        stopping_criterion.update_information(saver)

    # Reset the criterion so the next fold starts fresh.
    stopping_criterion.reset()
    return saver

# One result list per noisy-oracle strategy.
ceal_result = []
iet_result = []
all_result = []
rand_result = []

for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # init strategies (fresh per fold)
    ceal = QueryNoisyOraclesCEAL(X, y, oracles=oracles, initial_labeled_indexes=label_ind)
    iet = QueryNoisyOraclesIEthresh(X=X, y=y, oracles=oracles, initial_labeled_indexes=label_ind)
    # Renamed from `all` so the builtin all() is not shadowed.
    all_strategy = QueryNoisyOraclesAll(X=X, y=y, oracles=oracles)
    rand = QueryNoisyOraclesRandom(X=X, y=y, oracles=oracles)

    # Deep-copy each saver so later folds cannot mutate stored results.
    ceal_result.append(copy.deepcopy(al_loop(ceal, alibox, round)))
    iet_result.append(copy.deepcopy(al_loop(iet, alibox, round)))
    all_result.append(copy.deepcopy(al_loop(all_strategy, alibox, round)))
    rand_result.append(copy.deepcopy(al_loop(rand, alibox, round)))

# Show every oracle's complete query history, then plot the learning curves.
print(oracles.full_history())
analyser = alibox.get_experiment_analyser(x_axis='cost')
for method_results, method_name in ((ceal_result, 'ceal'),
                                    (iet_result, 'iet'),
                                    (all_result, 'all'),
                                    (rand_result, 'rand')):
    analyser.add_method(method_results=method_results, method_name=method_name)
analyser.plot_learning_curves()
import copy
import numpy as np
from sklearn.datasets import load_iris, make_multilabel_classification
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.metrics import f1_score
from alipy.query_strategy.query_type import QueryTypeAURO
from alipy.query_strategy.multi_label import LabelRankingModel
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy import ToolBox

X, y = load_iris(return_X_y=True)
# X, mult_y = make_multilabel_classification(n_samples=2500, n_labels=3, n_classes=10, n_features=15)
# One-hot encode the single iris label into a multi-label indicator matrix.
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())
X = normalize(X, norm='l2')
# Keep an untouched {0,1} copy for sklearn metrics before remapping below.
mult_y_for_metric = mult_y.copy()
# For the label ranking model 0 means unknown, so mark irrelevant labels as -1.
mult_y[mult_y == 0] = -1

alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
model = LabelRankingModel() # base model

# query type strategy: AURO asks relevance-comparison queries
AURO_results = []

for round in range(5):

    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    query_y = mult_y.copy() # working copy, also records `less relevant` (0.5) answers
    AURO_strategy = QueryTypeAURO(X=X, y=mult_y)
    # init model on the initially labeled pairs
    X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
    model.fit(X=X_tr, y=y_tr)

    for iter in range(100):

        # AURO picks an instance and two candidate labels to compare.
        select_ins, select_y1, select_y2 = AURO_strategy.select(label_ind, unlab_ind, model=model, y_mat=query_y)

        # Simulate the oracle from the ground-truth relevance of both labels.
        y1 = mult_y[select_ins, select_y1]
        y2 = mult_y[select_ins, select_y2]
        if y1 < 0 and y2 < 0:
            # Both labels irrelevant.
            query_y[select_ins, select_y1] = -1
            query_y[select_ins, select_y2] = -1
        elif y1 > y2:
            # First label more relevant; mark the weaker one 0.5 (`less relevant`).
            query_y[select_ins, select_y1] = 1
            query_y[select_ins, select_y2] = 0.5
        else:
            query_y[select_ins, select_y1] = 0.5
            query_y[select_ins, select_y2] = 1

        # record results: both queried pairs become labeled
        label_ind.update([(select_ins, select_y1), (select_ins, select_y2)])
        unlab_ind.difference_update([(select_ins, select_y1), (select_ins, select_y2)])

        # train/test: incrementally fit on the two newly labeled pairs only
        X_tr, y_tr, _ = get_Xy_in_multilabel([(select_ins, select_y1), (select_ins, select_y2)], X=X, y=query_y, unknown_element=0)
        model.fit(X=X_tr, y=y_tr, is_incremental=True)
        pres, pred = model.predict(X[test_idx])

        # using sklearn to calc micro-f1 (map -1 back to 0 first)
        pred[pred == -1] = 0
        perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')

        # save
        st = alibox.State(select_index=[(select_ins, select_y1), (select_ins, select_y2)], performance=perf)
        saver.add_state(st)


    # NOTE(review): shallow copy here (copy.copy), unlike copy.deepcopy in the
    # other examples — confirm the saver is not mutated by later folds.
    AURO_results.append(copy.copy(saver))

# Aggregate the AURO results and plot one performance point every 5 queries.
analyser = alibox.get_experiment_analyser()
analyser.add_method(method_name='AURO', method_results=AURO_results)
analyser.plot_learning_curves(plot_interval=5)
import numpy as np
from sklearn.linear_model import LogisticRegression
from alipy.query_strategy import QueryInstanceUncertainty
from alipy.index import IndexCollection
from alipy.oracle import MatrixRepository

# Your labeled set
X_lab = np.random.randn(100, 10)
y_lab = np.random.randint(low=0, high=2, size=100)
# The unlabeled pool, the labels of unlabeled data can be anything. The algorithm will not use them.
X_unlab = np.random.rand(100,10)
y_place_holder = np.random.randint(low=0, high=2, size=100)

# Initialize a query strategy.
unc = QueryInstanceUncertainty(X=np.vstack((X_unlab, X_lab)), y=np.hstack((y_place_holder, y_lab)))
unlab_ind = IndexCollection(np.arange(100))   # Indexes of your unlabeled set for querying
label_ind = IndexCollection(np.arange(start=100, stop=200))  # Indexes of your labeled set
labeled_repo = MatrixRepository(examples=X_lab, labels=y_lab, indexes=label_ind)   # Create a repository to store the labeled instances

# Initialize your model
model = LogisticRegression(solver='liblinear')
model.fit(X_lab, y_lab)

# Set the stopping criterion
for i in range(50):
    # Use a sklearn model to select instances.
    select_ind = unc.select(label_index=label_ind, unlabel_index=unlab_ind, model=model, batch_size=1)
    label_ind.update(select_ind)
    unlab_ind.difference_update(select_ind)

    # Label the selected instance here
    selected_instance = X_unlab[select_ind]
    # Replace this line with your own labeling code ! But not always labeling instances with 1.
    lab_of_ins = 1

    # Add the labeled example to the repo
    labeled_repo.update_query(labels=lab_of_ins, indexes=select_ind, examples=selected_instance)

    # If you are using your own model, update your model here, and pass it to unc.select()
    X_tr, y_tr, ind = labeled_repo.get_training_data()
    model.fit(X_lab, y_lab)

    # Update the label matrix of the query strategy here in just case that the algorithm may use the labels of labeled set
    unc.y[select_ind] = lab_of_ins

# Display the labeling history
print(labeled_repo.full_history())

import pickle
# Persist the labeled repository so the annotation work can be resumed later.
with open('my_labeled_set.pkl', 'wb') as f:
    pickle.dump(labeled_repo, f)

Copyright © 2018, alipy developers (BSD 3-Clause License).