"""
Main flow of the artificial set data generator program.
This module should be treated as a client of the library.
"""

import artificial_set_data_generator as dg
import data_utilities
import random
import numpy as np

# Parameters
DATA_SIZE = 1200
NUMBER_OF_CLUSTER = 16

# Selects the cluster-size layout; this value is also embedded in the output
# file names, so it must describe the sizes that are actually used.
# 1: equal size, 2: 4x120 and 12x60, 3: 4x150 and 12x50,
# 4: 4x187-8 and 12x37-8, 5: 4x210 and 12x30
UNBALANCE_TYPE = 1

# Explicit per-cluster sizes for each unbalance type. Keeping them in one
# table (instead of commenting/uncommenting alternative assignments) means
# SIZE_OF_CLUSTERS can never disagree with UNBALANCE_TYPE.
CLUSTER_SIZES_BY_TYPE = {
    # Type 1 passes no explicit sizes.
    # NOTE(review): presumably dg.generate treats an empty list as
    # "equal-sized clusters" -- confirm against the library.
    1: [],
    2: [120] * 4 + [60] * 12,
    3: [150] * 4 + [50] * 12,
    4: [187, 187, 188, 188] + [37] * 6 + [38] * 6,
    5: [210] * 4 + [30] * 12,
}

# Copy the entry so the lookup table itself is never modified through
# SIZE_OF_CLUSTERS. An unknown UNBALANCE_TYPE raises KeyError here, before
# any data is generated.
SIZE_OF_CLUSTERS = list(CLUSTER_SIZES_BY_TYPE[UNBALANCE_TYPE])

DIMENSION = 200
DISTANCE_THRESHOLD = 0.79  # Play with this to get the desired overlap
SIZE_OF_SET = (4, 20)  # Min and max number of items in each data point
ALL_FEATURE_ICD10_FILE_PATH = '1300_disease_codes.txt'

# Seed Python's `random` module so the feature sample drawn below is repeatable
random.seed(10)

# Starts out empty and is handed to the generator as its final argument
gt_representative = []

# Read the full feature list, then keep a random subset of exactly
# DIMENSION entries taken from the first row of what was read.
feature_pool = data_utilities.read_file(ALL_FEATURE_ICD10_FILE_PATH)
first_row = feature_pool.tolist()[0]
all_features = np.array(random.sample(first_row, DIMENSION))

# Calling the library: arguments are collected first, in the positional
# order the generator is invoked with.
generator_args = (
    DATA_SIZE,
    SIZE_OF_CLUSTERS,
    NUMBER_OF_CLUSTER,
    DIMENSION,
    DISTANCE_THRESHOLD,
    SIZE_OF_SET,
    all_features,
    gt_representative,
)
(
    data,
    ground_truth_labels,
    representatives,
    overlap_percentage,
) = dg.generate(*generator_args)

# Report the overlap exactly as returned by the generator
print('--- overlap percentage is:', overlap_percentage)

# The rounded overlap is the value that goes into the output file names
overlap_percentage = round(overlap_percentage)

# All three output files share the same five name fields, in this order
name_fields = (
    DATA_SIZE,
    DIMENSION,
    NUMBER_OF_CLUSTER,
    overlap_percentage,
    UNBALANCE_TYPE,
)

OUT_DATA_FILE_NAME = '../datasets2/data_{}_{}_{}_{}_{}.txt'.format(*name_fields)
OUT_GT_REPRESENTATIVE_FILE_NAME = '../datasets2/representative_{}_{}_{}_{}_{}.txt'.format(*name_fields)
OUT_GT_LABELS_FILE_NAME = '../datasets2/labels_{}_{}_{}_{}_{}.txt'.format(*name_fields)

# Data and representatives go through the project's writer; the labels are
# written transposed with numpy, formatted as integers.
data_utilities.write_file(data, OUT_DATA_FILE_NAME)
data_utilities.write_file(representatives, OUT_GT_REPRESENTATIVE_FILE_NAME)
np.savetxt(OUT_GT_LABELS_FILE_NAME, ground_truth_labels.T, fmt='%d')
