# Example usages of ALiPy. You can also download them from the github repository.
import copy
from sklearn.datasets import make_classification
from alipy import ToolBox

# Generate a synthetic binary classification dataset for the example.
X, y = make_classification(n_samples=500, n_features=10, n_informative=5, n_redundant=2,
    n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0,
    hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
# Split data: 30% test; 10% of the remainder initially labeled; 10 independent folds.
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10)
# Use the default Logistic Regression classifier
model = alibox.get_default_model()
# The budget is 50 queries per fold.
stopping_criterion = alibox.get_stopping_criterion('num_of_queries', 50)
def main_loop(alibox, strategy, round):
    """Run one fold of active learning with `strategy`; return its StateIO saver.

    Relies on the module-level `model`, `X`, `y` and `stopping_criterion`.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Intermediate-result saver for this fold.
    saver = alibox.get_stateio(round)

    def _fit_and_score():
        # Retrain on the currently labeled pool, then score accuracy on the test split.
        model.fit(X=X[label_ind.index, :], y=y[label_ind.index])
        predictions = model.predict(X[test_idx, :])
        return alibox.calc_performance_metric(y_true=y[test_idx],
                                              y_pred=predictions,
                                              performance_metric='accuracy_score')

    # Record performance before any query is made.
    saver.set_initial_point(_fit_and_score())

    # If the stopping criterion is simple, such as query 50 times, a plain
    # `for i in range(50):` would work just as well.
    while not stopping_criterion.is_stop():
        # Ask the strategy for one instance. Passing no model lets committee-based
        # strategies use their default model to evaluate disagreement.
        queried = strategy.select(label_index=label_ind, unlabel_index=unlab_ind, batch_size=1)
        label_ind.update(queried)
        unlab_ind.difference_update(queried)
        # Retrain, re-evaluate, and persist this round's result.
        saver.add_state(alibox.State(select_index=queried, performance=_fit_and_score()))
        # Feed the current progress to the stopping criterion.
        stopping_criterion.update_information(saver)

    # Reset so the criterion can be reused for the next fold/strategy.
    stopping_criterion.reset()
    return saver
# One result list per query strategy; each holds one StateIO saver per fold.
unc_result = []
qbc_result = []
eer_result = []
quire_result = []
density_result = []
bmdr_result = []
spal_result = []
lal_result = []
rnd_result = []
# BMDR and SPAL need the cvxpy package; flip this to True if it is installed.
_I_have_installed_the_cvxpy = False
for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Use pre-defined strategy
    unc = alibox.get_query_strategy(strategy_name="QueryInstanceUncertainty")
    qbc = alibox.get_query_strategy(strategy_name="QueryInstanceQBC")
    eer = alibox.get_query_strategy(strategy_name="QueryExpectedErrorReduction")
    rnd = alibox.get_query_strategy(strategy_name="QueryInstanceRandom")
    quire = alibox.get_query_strategy(strategy_name="QueryInstanceQUIRE", train_idx=train_idx)
    density = alibox.get_query_strategy(strategy_name="QueryInstanceGraphDensity", train_idx=train_idx)
    lal = alibox.get_query_strategy(strategy_name="QueryInstanceLAL", cls_est=10, train_slt=False)
    # LAL uses a pre-trained selector: fetch its data, then train the selector.
    lal.download_data()
    lal.train_selector_from_file(reg_est=30, reg_depth=5)
    unc_result.append(copy.deepcopy(main_loop(alibox, unc, round)))
    qbc_result.append(copy.deepcopy(main_loop(alibox, qbc, round)))
    eer_result.append(copy.deepcopy(main_loop(alibox, eer, round)))
    rnd_result.append(copy.deepcopy(main_loop(alibox, rnd, round)))
    quire_result.append(copy.deepcopy(main_loop(alibox, quire, round)))
    density_result.append(copy.deepcopy(main_loop(alibox, density, round)))
    lal_result.append(copy.deepcopy(main_loop(alibox, lal, round)))
    if _I_have_installed_the_cvxpy:
        bmdr = alibox.get_query_strategy(strategy_name="QueryInstanceBMDR", kernel='rbf')
        spal = alibox.get_query_strategy(strategy_name="QueryInstanceSPAL", kernel='rbf')
        bmdr_result.append(copy.deepcopy(main_loop(alibox, bmdr, round)))
        spal_result.append(copy.deepcopy(main_loop(alibox, spal, round)))

# Aggregate the folds and plot performance against the number of queries.
analyser = alibox.get_experiment_analyser(x_axis='num_of_queries')
analyser.add_method(method_name='QBC', method_results=qbc_result)
analyser.add_method(method_name='Unc', method_results=unc_result)
analyser.add_method(method_name='EER', method_results=eer_result)
analyser.add_method(method_name='Random', method_results=rnd_result)
analyser.add_method(method_name='QUIRE', method_results=quire_result)
analyser.add_method(method_name='Density', method_results=density_result)
analyser.add_method(method_name='LAL', method_results=lal_result)
if _I_have_installed_the_cvxpy:
    analyser.add_method(method_name='BMDR', method_results=bmdr_result)
    analyser.add_method(method_name='SPAL', method_results=spal_result)
print(analyser)
analyser.plot_learning_curves(title='Example of alipy', std_area=False)
import copy
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, normalize
from alipy import ToolBox
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy.query_strategy.multi_label import *

# Turn iris into a multi-label problem by one-hot encoding the class label.
X, y = load_iris(return_X_y=True)
X = normalize(X, norm='l2')
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())
# Keep a 0/1 copy for sklearn metrics before the remapping below.
mult_y_for_metric = mult_y.copy()
# Or generate a dataset with any sizes
# X, mult_y = make_multilabel_classification(n_samples=5000, n_features=20, n_classes=5, length=5)
# Since we are using the label ranking model, the label 0 means unknown. We need to
# set the 0 entries to -1, which means irrelevant.
mult_y[mult_y == 0] = -1
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
model = LabelRankingModel()  # base model
def main_loop(alibox, round, strategy):
    """Run one fold of multi-label active learning; return a deep copy of the saver.

    Relies on the module-level `model`, `X`, `mult_y` and `mult_y_for_metric`.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # init model on the initially labeled instance-label pairs
    X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y, unknown_element=0)
    model.fit(X=X_tr, y=y_tr)
    ini_lab_num = len(label_ind)
    # A simple stopping criterion to specify the query budget
    # (budget counted in instance-label pairs).
    while len(label_ind) - ini_lab_num <= 120:
        # query and update
        if isinstance(strategy, QueryMultiLabelAUDI):
            # If you are using a label ranking model, pass it to AUDI. It can
            # avoid re-training a label ranking model inside the algorithm
            select_labs = strategy.select(label_ind, unlab_ind, model=model)
        else:
            select_labs = strategy.select(label_ind, unlab_ind)
        # use cost to record the amount of queried instance-label pairs:
        # a length-1 entry means the whole instance (all labels) was queried.
        if len(select_labs[0]) == 1:
            cost = mult_y.shape[1]
        else:
            cost = len(select_labs)
        label_ind.update(select_labs)
        unlab_ind.difference_update(select_labs)
        # train/test: incrementally fit on the newly queried pairs only
        X_tr, y_tr, _ = get_Xy_in_multilabel(select_labs, X=X, y=mult_y, unknown_element=0)
        model.fit(X=X_tr, y=y_tr, is_incremental=True)
        pres, pred = model.predict(X[test_idx])
        # map the ranking model's -1 (irrelevant) back to 0 for sklearn's f1_score
        pred[pred == -1] = 0
        perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')
        # save
        st = alibox.State(select_index=select_labs, performance=perf, cost=cost)
        saver.add_state(st)
        saver.save()
    return copy.deepcopy(saver)
# One result list per multi-label strategy; 3 folds each.
audi_result = []
quire_result = []
random_result = []
mmc_result = []
adaptive_result = []
for round in range(3):
    # init strategies
    audi = QueryMultiLabelAUDI(X, mult_y)
    quire = QueryMultiLabelQUIRE(X, mult_y, kernel='rbf')
    mmc = QueryMultiLabelMMC(X, mult_y)
    adaptive = QueryMultiLabelAdaptive(X, mult_y)
    # NOTE(review): this name shadows the stdlib `random` module if it gets imported later.
    # select_type='ins' presumably selects at whole-instance granularity — verify in alipy docs.
    random = QueryMultiLabelRandom(select_type='ins')
    audi_result.append(main_loop(alibox, round, strategy=audi))
    quire_result.append(main_loop(alibox, round, strategy=quire))
    mmc_result.append(main_loop(alibox, round, strategy=mmc))
    adaptive_result.append(main_loop(alibox, round, strategy=adaptive))
    random_result.append(main_loop(alibox, round, strategy=random))

# Aggregate the folds and plot micro-F1 against accumulated query cost.
analyser = alibox.get_experiment_analyser(x_axis='cost')
analyser.add_method(method_name='AUDI', method_results=audi_result)
analyser.add_method(method_name='QUIRE', method_results=quire_result)
analyser.add_method(method_name='RANDOM', method_results=random_result)
analyser.add_method(method_name='MMC', method_results=mmc_result)
analyser.add_method(method_name='Adaptive', method_results=adaptive_result)
analyser.plot_learning_curves(plot_interval=3)  # plot a performance point in every 3 queries of instance-label pairs
import copy
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from alipy.data_manipulate.al_split import split_features
from alipy.query_strategy.query_features import QueryFeatureAFASMC, QueryFeatureRandom, QueryFeatureStability, \
    AFASMC_mc, IterativeSVD_mc
from alipy.index import MultiLabelIndexCollection
from alipy.experiment.stopping_criteria import StoppingCriteria
from alipy.experiment import StateIO, State, ExperimentAnalyser
from alipy.metrics import accuracy_score
from alipy.index import map_whole_index_to_train

# load and split data for feature querying: half of the feature entries are missing.
X, y = make_classification(n_samples=800, n_features=20, n_informative=2, n_redundant=2,
    n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01,
    hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
tr, te, lab, unlab = split_features(feature_matrix=X, test_ratio=0.3, missing_rate=0.5,
    split_count=10, saving_path=None)
# Use the default Logistic Regression classifier
model = LogisticRegression(solver='liblinear')
# The budget is 50 queries per fold.
stopping_criterion = StoppingCriteria('num_of_queries', 50)
AFASMC_result = []
rand_result = []
Stable_result = []
# AFASMC: active feature acquisition with its own matrix-completion step.
for i in range(5):
    train_idx = tr[i]
    test_idx = te[i]
    # (row, column) entries of the feature matrix that are observed / missing.
    label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
    unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
    saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
    strategy = QueryFeatureAFASMC(X=X, y=y, train_idx=train_idx)
    while not stopping_criterion.is_stop():
        # query (the 'unkonwn_entries' spelling matches the alipy API)
        selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)
        # update index
        label_ind.update(selected_feature)
        unlab_ind.difference_update(selected_feature)
        # train/test: complete the partially observed training matrix, then fit
        lab_in_train = map_whole_index_to_train(train_idx, label_ind)
        X_mc = AFASMC_mc(X=X[train_idx], y=y[train_idx], omega=lab_in_train)
        model.fit(X_mc, y[train_idx])
        pred = model.predict(X[test_idx])
        perf = accuracy_score(y_true=y[test_idx], y_pred=pred)
        # save
        st = State(select_index=selected_feature, performance=perf)
        saver.add_state(st)
        # saver.save()
        stopping_criterion.update_information(saver)
    stopping_criterion.reset()
    AFASMC_result.append(copy.deepcopy(saver))
# Shared SVD-based matrix-completion imputer for the remaining strategies.
SVD_mc = IterativeSVD_mc(rank=4)
# Stability
for i in range(5):
    train_idx = tr[i]
    test_idx = te[i]
    label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
    unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
    saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
    strategy = QueryFeatureStability(X=X, y=y, train_idx=train_idx, rank_arr=[4, 6, 8])
    while not stopping_criterion.is_stop():
        # query (the 'unkonwn_entries' spelling matches the alipy API)
        selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)
        # update index
        label_ind.update(selected_feature)
        unlab_ind.difference_update(selected_feature)
        # train/test: impute the missing entries with iterative SVD, then fit
        lab_in_train = map_whole_index_to_train(train_idx, label_ind)
        X_mc = SVD_mc.impute(X[train_idx], observed_mask=lab_in_train.get_matrix_mask(mat_shape=(len(train_idx), X.shape[1]), sparse=False))
        model.fit(X_mc, y[train_idx])
        pred = model.predict(X[test_idx])
        perf = accuracy_score(y_true=y[test_idx], y_pred=pred)
        # save
        st = State(select_index=selected_feature, performance=perf)
        saver.add_state(st)
        stopping_criterion.update_information(saver)
    stopping_criterion.reset()
    Stable_result.append(copy.deepcopy(saver))
# Random feature querying (baseline).
for i in range(5):
    train_idx = tr[i]
    test_idx = te[i]
    label_ind = MultiLabelIndexCollection(lab[i], label_size=X.shape[1])
    unlab_ind = MultiLabelIndexCollection(unlab[i], label_size=X.shape[1])
    saver = StateIO(i, train_idx, test_idx, label_ind, unlab_ind)
    strategy = QueryFeatureRandom()
    while not stopping_criterion.is_stop():
        # query (the 'unkonwn_entries' spelling matches the alipy API)
        selected_feature = strategy.select(observed_entries=label_ind, unkonwn_entries=unlab_ind)
        # update index
        label_ind.update(selected_feature)
        unlab_ind.difference_update(selected_feature)
        # train/test: impute the missing entries with iterative SVD, then fit
        lab_in_train = map_whole_index_to_train(train_idx, label_ind)
        X_mc = SVD_mc.impute(X[train_idx], observed_mask=lab_in_train.get_matrix_mask(mat_shape=(len(train_idx), X.shape[1]), sparse=False))
        model.fit(X_mc, y[train_idx])
        pred = model.predict(X[test_idx])
        perf = accuracy_score(y_true=y[test_idx], y_pred=pred)
        # save
        st = State(select_index=selected_feature, performance=perf)
        saver.add_state(st)
        stopping_criterion.update_information(saver)
    stopping_criterion.reset()
    rand_result.append(copy.deepcopy(saver))

# Aggregate and plot the three feature-querying strategies.
analyser = ExperimentAnalyser()
analyser.add_method(method_results=AFASMC_result, method_name='AFASMC')
analyser.add_method(method_results=Stable_result, method_name='Stability')
analyser.add_method(method_results=rand_result, method_name='Random')
print(analyser)
analyser.plot_learning_curves()
import numpy as np
import copy
from sklearn.datasets import make_multilabel_classification
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from alipy import ToolBox
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy.query_strategy.cost_sensitive import QueryCostSensitiveHALC, QueryCostSensitivePerformance, QueryCostSensitiveRandom
from alipy.query_strategy.cost_sensitive import hierarchical_multilabel_mark

# the num of classes of the classification problem
NUM_CLASS = 5
NUM_SAMPLES = 2000
X, y = make_multilabel_classification(n_samples=NUM_SAMPLES, n_features=20, n_classes=NUM_CLASS,
                                      n_labels=3, length=50, allow_unlabeled=True,
                                      sparse=False, return_indicator='dense',
                                      return_distributions=False,
                                      random_state=None)
# Mark irrelevant labels with -1 (0 is reserved for "unknown" entries).
y[y == 0] = -1
# the cost of querying each class
cost = [1, 3, 3, 7, 10]
# if node_i is the parent of node_j, then label_tree[i, j] = 1, else 0.
# Fix: `np.int` was deprecated in NumPy 1.20 and removed in 1.24 and raises
# AttributeError there -- use the builtin int dtype instead.
label_tree = np.zeros((5, 5), dtype=int)
label_tree[0, 1] = 1
label_tree[0, 2] = 1
label_tree[1, 3] = 1
label_tree[2, 4] = 1
alibox = ToolBox(X=X, y=y, query_type='PartLabels')
# Split data
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.1, split_count=10, all_class=True)
# Train one binary SVC per label.
models = [SVC(decision_function_shape='ovr', gamma='auto') for _ in range(NUM_CLASS)]
# The budget of each query (passed to strategy.select below).
budget = 40
# Stop when the accumulated query cost reaches 500.
stopping_criterion = alibox.get_stopping_criterion('cost_limit', 500)
performance_result = []
halc_result = []
random_result = []
def main_loop(alibox, strategy, round):
    """Run one fold of cost-sensitive multi-label AL; return the StateIO saver.

    Relies on the module-level `models` (one binary SVC per label), `X`, `y`,
    `cost`, `budget`, `label_tree` and `stopping_criterion`.
    """
    # Get the data split of one fold experiment
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Get intermediate results saver for one fold experiment
    saver = alibox.get_stateio(round)
    # initalizing the models: fit each per-label SVC on its observed entries
    train_traget = label_ind.get_matrix_mask((NUM_SAMPLES, NUM_CLASS), sparse=False)
    for j in np.arange(NUM_CLASS):
        j_target = train_traget[:, j]
        # rows where label j has been observed for this fold
        i_samples = np.where(j_target!=0)[0]
        m = models[j]
        m.fit(X[i_samples, :], y[i_samples, j])
    while not stopping_criterion.is_stop():
        # Select a subset of Uind according to the query strategy
        select_ind = strategy.select(label_ind, unlab_ind, cost=cost, budget=budget, models=models)
        # mark the queried labels consistently with the label hierarchy (label_tree)
        select_ind = hierarchical_multilabel_mark(select_ind, label_ind, label_tree, y)
        label_ind.update(select_ind)
        unlab_ind.difference_update(select_ind)
        # Update model and calc performance according to the model you are using:
        # refit every per-label SVC on the enlarged observed set
        train_traget = label_ind.get_matrix_mask((NUM_SAMPLES, NUM_CLASS), sparse=False)
        for j in np.arange(NUM_CLASS):
            j_target = train_traget[:, j]
            i_samples = np.where(j_target!=0)[0]
            m = models[j]
            m.fit(X[i_samples, :], y[i_samples, j])
        # Stack the per-label predictions into an (n_test, NUM_CLASS) matrix
        pred = None
        for j in np.arange(NUM_CLASS):
            model = models[j]
            pred_j = model.predict(X[test_idx])
            if pred is None:
                pred = pred_j.reshape((len(test_idx), 1))
            else:
                pred = np.hstack((pred, pred_j.reshape((len(test_idx), 1))))
        performance = alibox.calc_performance_metric(y_true=y[test_idx], y_pred=pred, performance_metric='hamming_loss')
        # Save intermediate results to file
        st = alibox.State(select_index=select_ind.index, performance=performance, cost=budget)
        saver.add_state(st)
        # Passing the current progress to stopping criterion object
        stopping_criterion.update_information(saver)
    # Reset the progress in stopping criterion object
    stopping_criterion.reset()
    return saver
# Run 5 folds for each cost-sensitive strategy.
for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Use pre-defined strategy
    random = QueryCostSensitiveRandom(X,y)
    perf = QueryCostSensitivePerformance(X, y)
    halc = QueryCostSensitiveHALC(X, y,label_tree=label_tree)
    random_result.append(copy.deepcopy(main_loop(alibox, random, round)))
    performance_result.append(copy.deepcopy(main_loop(alibox, perf, round)))
    halc_result.append(copy.deepcopy(main_loop(alibox, halc, round)))

# Aggregate the folds and plot hamming loss against accumulated cost.
analyser = alibox.get_experiment_analyser(x_axis='cost')
analyser.add_method(method_name='random', method_results=random_result)
analyser.add_method(method_name='performance', method_results=performance_result)
analyser.add_method(method_name='HALC', method_results=halc_result)
print(analyser)
analyser.plot_learning_curves(title='Example of cost-sensitive', std_area=False)
from alipy.toolbox import ToolBox
from alipy.oracle import Oracle, Oracles
from alipy.utils.misc import randperm
from alipy.query_strategy.noisy_oracles import QueryNoisyOraclesCEAL, QueryNoisyOraclesAll, \
    QueryNoisyOraclesIEthresh, QueryNoisyOraclesRandom, get_majority_vote
from sklearn.datasets import make_classification
import copy
import numpy as np

X, y = make_classification(n_samples=800, n_features=20, n_informative=2, n_redundant=2,
    n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01,
    hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None)
alibox = ToolBox(X=X, y=y, query_type='AllLabels', saving_path='.')
# Split data
alibox.split_AL(test_ratio=0.3, initial_label_rate=0.15, split_count=10)
# Use the default Logistic Regression classifier
model = alibox.get_default_model()
# Stop once the accumulated query cost reaches 30.
stopping_criterion = alibox.get_stopping_criterion('cost_limit', 30)
# initialize noisy oracles with different noise level:
# y1..y5 flip 10%, 20%, 30%, 40%, 50% of the true labels respectively.
n_samples = len(y)
y1 = y.copy()
y2 = y.copy()
y3 = y.copy()
y4 = y.copy()
y5 = y.copy()
perms = randperm(n_samples-1)
y1[perms[0:round(n_samples*0.1)]] = 1-y1[perms[0:round(n_samples*0.1)]]
perms = randperm(n_samples-1)
y2[perms[0:round(n_samples*0.2)]] = 1-y2[perms[0:round(n_samples*0.2)]]
perms = randperm(n_samples-1)
y3[perms[0:round(n_samples*0.3)]] = 1-y3[perms[0:round(n_samples*0.3)]]
perms = randperm(n_samples-1)
y4[perms[0:round(n_samples*0.4)]] = 1-y4[perms[0:round(n_samples*0.4)]]
perms = randperm(n_samples-1)
y5[perms[0:round(n_samples*0.5)]] = 1-y5[perms[0:round(n_samples*0.5)]]
# Each oracle has a per-instance cost array; noisier oracles charge less.
oracle1 = Oracle(labels=y1, cost=np.zeros(y.shape)+1.2)
oracle2 = Oracle(labels=y2, cost=np.zeros(y.shape)+.8)
oracle3 = Oracle(labels=y3, cost=np.zeros(y.shape)+.5)
oracle4 = Oracle(labels=y4, cost=np.zeros(y.shape)+.4)
oracle5 = Oracle(labels=y5, cost=np.zeros(y.shape)+.3)
# Constant-answer oracles: always answer 0 / always answer 1.
oracle6 = Oracle(labels=[0]*n_samples, cost=np.zeros(y.shape)+.3)
oracle7 = Oracle(labels=[1]*n_samples, cost=np.zeros(y.shape)+.3)
oracles = Oracles()
oracles.add_oracle(oracle_name='o1', oracle_object=oracle1)
oracles.add_oracle(oracle_name='o2', oracle_object=oracle2)
oracles.add_oracle(oracle_name='o3', oracle_object=oracle3)
oracles.add_oracle(oracle_name='o4', oracle_object=oracle4)
# oracles.add_oracle(oracle_name='o5', oracle_object=oracle5)
oracles.add_oracle(oracle_name='oa0', oracle_object=oracle6)
oracles.add_oracle(oracle_name='oa1', oracle_object=oracle7)
# oracles_list = [oracle1, oracle2]
def al_loop(strategy, alibox, round):
    """Run one fold of active learning with noisy oracles; return the StateIO saver.

    Relies on the module-level `model`, `X`, `y`, `oracles` and `stopping_criterion`.
    """
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # Intermediate-result saver and a repository for the labels we collect.
    saver = alibox.get_stateio(round)
    repo = alibox.get_repository(round)

    while not stopping_criterion.is_stop():
        # The strategy picks both the instance and which oracle(s) to ask.
        queried_ins, queried_oracles = strategy.select(label_ind, unlab_ind)
        _, vote_result, cost = get_majority_vote(selected_instance=queried_ins,
                                                 oracles=oracles, names=queried_oracles)
        repo.update_query(labels=vote_result, indexes=queried_ins)
        # Move the instance from the unlabeled pool to the labeled pool.
        label_ind.update(queried_ins)
        unlab_ind.difference_update(queried_ins)
        # Retrain on every label collected so far and evaluate on the test split.
        _, y_lab, indexes_lab = repo.get_training_data()
        model.fit(X=X[indexes_lab], y=y_lab)
        perf = alibox.calc_performance_metric(y_true=y[test_idx],
                                              y_pred=model.predict(X[test_idx]))
        # Persist this round: which instance, at what performance, at what cost.
        saver.add_state(alibox.State(select_index=queried_ins, performance=perf, cost=cost))
        stopping_criterion.update_information(saver)

    # Reset so the criterion can be reused for the next fold/strategy.
    stopping_criterion.reset()
    return saver
# One result list per noisy-oracle strategy; 5 folds each.
ceal_result = []
iet_result = []
all_result = []
rand_result = []
for round in range(5):
    train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
    # init strategies
    ceal = QueryNoisyOraclesCEAL(X, y, oracles=oracles, initial_labeled_indexes=label_ind)
    iet = QueryNoisyOraclesIEthresh(X=X, y=y, oracles=oracles, initial_labeled_indexes=label_ind)
    # Fix: the original bound this to the name `all`, shadowing the builtin all().
    all_strategy = QueryNoisyOraclesAll(X=X, y=y, oracles=oracles)
    rand = QueryNoisyOraclesRandom(X=X, y=y, oracles=oracles)
    ceal_result.append(copy.deepcopy(al_loop(ceal, alibox, round)))
    iet_result.append(copy.deepcopy(al_loop(iet, alibox, round)))
    all_result.append(copy.deepcopy(al_loop(all_strategy, alibox, round)))
    rand_result.append(copy.deepcopy(al_loop(rand, alibox, round)))

# Show every query answered by the oracles, then plot performance vs. cost.
print(oracles.full_history())
analyser = alibox.get_experiment_analyser(x_axis='cost')
analyser.add_method(method_results=ceal_result, method_name='ceal')
analyser.add_method(method_results=iet_result, method_name='iet')
analyser.add_method(method_results=all_result, method_name='all')
analyser.add_method(method_results=rand_result, method_name='rand')
analyser.plot_learning_curves()
import copy
import numpy as np
from sklearn.datasets import load_iris, make_multilabel_classification
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.metrics import f1_score
from alipy.query_strategy.query_type import QueryTypeAURO
from alipy.query_strategy.multi_label import LabelRankingModel
from alipy.index.multi_label_tools import get_Xy_in_multilabel
from alipy import ToolBox
X, y = load_iris(return_X_y=True)
# X, mult_y = make_multilabel_classification(n_samples=2500, n_labels=3, n_classes=10, n_features=15)
mlb = OneHotEncoder()
mult_y = mlb.fit_transform(y.reshape((-1, 1)))
mult_y = np.asarray(mult_y.todense())
X = normalize(X, norm='l2')
mult_y_for_metric = mult_y.copy()
mult_y[mult_y == 0] = -1
alibox = ToolBox(X=X, y=mult_y, query_type='PartLabels')
alibox.split_AL(test_ratio=0.2, initial_label_rate=0.05, all_class=False)
model = LabelRankingModel() # base model
# query type strategy
AURO_results = []
for round in range(5):
train_idx, test_idx, label_ind, unlab_ind = alibox.get_split(round)
# Get intermediate results saver for one fold experiment
saver = alibox.get_stateio(round)
query_y = mult_y.copy() # for labeling `less relevant`
AURO_strategy = QueryTypeAURO(X=X, y=mult_y)
# init model
X_tr, y_tr, _ = get_Xy_in_multilabel(label_ind, X=X, y=mult_y)
model.fit(X=X_tr, y=y_tr)
for iter in range(100):
select_ins, select_y1, select_y2 = AURO_strategy.select(label_ind, unlab_ind, model=model, y_mat=query_y)
# relevance
y1 = mult_y[select_ins, select_y1]
y2 = mult_y[select_ins, select_y2]
if y1 < 0 and y2 < 0:
query_y[select_ins, select_y1] = -1
query_y[select_ins, select_y2] = -1
elif y1 > y2:
query_y[select_ins, select_y1] = 1
query_y[select_ins, select_y2] = 0.5
else:
query_y[select_ins, select_y1] = 0.5
query_y[select_ins, select_y2] = 1
# record results
label_ind.update([(select_ins, select_y1), (select_ins, select_y2)])
unlab_ind.difference_update([(select_ins, select_y1), (select_ins, select_y2)])
# train/test
X_tr, y_tr, _ = get_Xy_in_multilabel([(select_ins, select_y1), (select_ins, select_y2)], X=X, y=query_y, unknown_element=0)
model.fit(X=X_tr, y=y_tr, is_incremental=True)
pres, pred = model.predict(X[test_idx])
# using sklearn to calc micro-f1
pred[pred == -1] = 0
perf = f1_score(y_true=mult_y_for_metric[test_idx], y_pred=pred, average='micro')
# save
st = alibox.State(select_index=[(select_ins, select_y1), (select_ins, select_y2)], performance=perf)
saver.add_state(st)
AURO_results.append(copy.copy(saver))
analyser = alibox.get_experiment_analyser()
analyser.add_method(method_name='AURO', method_results=AURO_results)
analyser.plot_learning_curves(plot_interval=5)
import numpy as np
from sklearn.linear_model import LogisticRegression
from alipy.query_strategy import QueryInstanceUncertainty
from alipy.index import IndexCollection
from alipy.oracle import MatrixRepository

# Your labeled set
X_lab = np.random.randn(100, 10)
y_lab = np.random.randint(low=0, high=2, size=100)
# The unlabeled pool; its labels are placeholders and never used by the algorithm.
X_unlab = np.random.rand(100, 10)
y_place_holder = np.random.randint(low=0, high=2, size=100)

# Initialize a query strategy. In the stacked matrix, unlabeled instances
# occupy indexes 0..99 and labeled ones 100..199.
unc = QueryInstanceUncertainty(X=np.vstack((X_unlab, X_lab)), y=np.hstack((y_place_holder, y_lab)))
unlab_ind = IndexCollection(np.arange(100))  # Indexes of your unlabeled set for querying
label_ind = IndexCollection(np.arange(start=100, stop=200))  # Indexes of your labeled set
# Repository storing the labeled instances collected so far.
labeled_repo = MatrixRepository(examples=X_lab, labels=y_lab, indexes=label_ind)

# Initialize your model
model = LogisticRegression(solver='liblinear')
model.fit(X_lab, y_lab)

# Stopping criterion: query 50 times.
for i in range(50):
    # Use a sklearn model to select instances.
    select_ind = unc.select(label_index=label_ind, unlabel_index=unlab_ind, model=model, batch_size=1)
    label_ind.update(select_ind)
    unlab_ind.difference_update(select_ind)
    # Label the selected instance here
    selected_instance = X_unlab[select_ind]
    # Replace this line with your own labeling code ! But not always labeling instances with 1.
    lab_of_ins = 1
    # Add the labeled example to the repo
    labeled_repo.update_query(labels=lab_of_ins, indexes=select_ind, examples=selected_instance)
    # Update your model on ALL labeled data gathered so far, and pass it to unc.select().
    # Bug fix: the original fetched (X_tr, y_tr) from the repository but then
    # refit on the initial (X_lab, y_lab), silently discarding every queried label.
    X_tr, y_tr, ind = labeled_repo.get_training_data()
    model.fit(X_tr, y_tr)
    # Keep the strategy's label matrix in sync, in case the algorithm uses the
    # labels of the labeled set.
    unc.y[select_ind] = lab_of_ins

# Display the labeling history
print(labeled_repo.full_history())

import pickle
with open('my_labeled_set.pkl', 'wb') as f:
    pickle.dump(labeled_repo, f)
# Copyright © 2018, alipy developers (BSD 3 License).