###########################################################
#Computational Linguistics 18-19
#DISSECT tutorial
#How to: PLAY WITH DISSECT - COMPOSITIONAL DSMs
#sandro.pezzelle@unitn.it
###########################################################
#demos and tutorials can be found here:
#http://clic.cimec.unitn.it/composes/toolkit/
#here we will use some of the tutorials at this page:
#http://clic.cimec.unitn.it/composes/toolkit/composing.html
#more info on compositional models in DISSECT:
#http://clic.cimec.unitn.it/composes/toolkit/compose.html
###########################################################
# CHECK-POINT QUESTIONS:
# 1. Why COSINE for measuring similarity?
# 2. What's the intuition behind applying SVD to vectors?
# 3. Why do we need COMPOSITIONAL MODELS?
###########################################################
#A BRIEF RECAP: LOAD SPACE, COMPUTE COSINE SIM, NEIGHBORS
###########################################################
#You can use pre-trained, existing embeddings like word2vec, GloVe, etc.
#some repositories are listed below:
#https://github.com/3Top/word2vec-api
#https://github.com/Hironsan/awesome-embedding-models
#I suggest using gensim for building spaces from raw text:
#https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec
#you should already have it installed...
#to simplify things, I provided you with a semantic space
#built using state-of-the-art word2vec. You downloaded the file in .bin,
#which is the default format you get out of word2vec and similar tools.
#we will now convert it into .pkl using a script that does the job,
#then we'll play with the DISSECT toolkit on it. READY?

cd ~
cd dissect-env/class/class-stuff/

#if you didn't do it already...
#python2.7 bin-to-pkl.py
#...to get the .pkl version of the space

python2.7

from composes.semantic_space.space import Space
from composes.similarity.cos import CosSimilarity
from composes.utils import io_utils
from composes.utils import log_utils

count_space = io_utils.load("./text8.pkl")
predict_space = io_utils.load("./vectors.pkl")

#we can compute pairwise similarities (and look at the differences!)
#let's try with concrete words:
print count_space.get_sim("school", "book", CosSimilarity())
print predict_space.get_sim("school", "book", CosSimilarity())
print count_space.get_sim("car", "book", CosSimilarity())
print predict_space.get_sim("car", "book", CosSimilarity())
#and now with abstract ones:
print count_space.get_sim("law", "politics", CosSimilarity())
print predict_space.get_sim("law", "politics", CosSimilarity())
#remember that the 'count' space we're using is the raw co-occurrence one! (1000 dimensions)

#we can also compute the neighbours...
print count_space.get_neighbours("school", 10, CosSimilarity())
print predict_space.get_neighbours("school", 10, CosSimilarity())
#do not quit python!
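#(aside, re: check-point question 1) cosine compares the DIRECTIONS of two
#vectors and ignores their lengths, so a frequent word and a rare word can
#still be close if they occur in the same contexts. A minimal numpy sketch of
#what CosSimilarity computes, on made-up toy vectors (not the real spaces):
import numpy as np
v1 = np.array([1.0, 2.0, 0.0])
v2 = np.array([2.0, 4.0, 0.0]) #same direction, twice the length
print np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) #prints 1.0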
###########################################################
#APPLY A SIMPLE (BUT USUALLY VERY EFFECTIVE) ADDITIVE MODEL
###########################################################
#python2.7
#from composes.utils import io_utils
#from composes.similarity.cos import CosSimilarity
from composes.composition.weighted_additive import WeightedAdditive

#we load our word2vec space
#predict_space = io_utils.load("./vectors.pkl")

#instantiate a weighted additive model, in this case a simple additive one
#how do we make it simple additive? By setting both weights to 1!
my_comp = WeightedAdditive(alpha = 1, beta = 1)

#use the model to compose words in predict_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car"),
                                  ("red", "shirt", "red_shirt"),
                                  ("yellow", "shirt", "yellow_shirt"),
                                  ("we", "need", "we_need")], predict_space)
#we are composing words contained in the same space!

print composed_space.id2row
print composed_space.cooccurrence_matrix
#these are our composed (summed) vectors

print composed_space.get_sim("red_shirt", "yellow_shirt", CosSimilarity())
print composed_space.get_sim("good_book", "good_car", CosSimilarity())
#these are the similarities between the composed phrases
#QUESTION: What do we notice wrt the similarities?

#we can also compute the similarity between a word in one space
#and a word/phrase in a different space
print predict_space.get_sim("car", "good_car", CosSimilarity(), space2 = composed_space)
print predict_space.get_sim("shirt", "yellow_shirt", CosSimilarity(), space2 = composed_space)
#QUESTION: What about the similarity between the AN phrase and the A?
print predict_space.get_sim("good", "good_car", CosSimilarity(), space2 = composed_space)
print predict_space.get_sim("yellow", "yellow_shirt", CosSimilarity(), space2 = composed_space)

#finally, we can also find the nearest neighbours in one space
#given a vector in a different space
print predict_space.get_neighbours("car", 4, CosSimilarity(), space2 = composed_space)
print composed_space.get_neighbours("red_shirt", 10, CosSimilarity(), space2 = predict_space)

#we can also do it recursively, obtaining phrases and sentences
#(better, and easier, to do it with a script!)
comp_space2 = my_comp.compose([("we_need", "good_book", "we_need_good_book"),
                               ("we_need", "good_car", "we_need_good_car")], composed_space)
print comp_space2.get_neighbours("we_need_good_car", 10, CosSimilarity(), space2 = predict_space)

#(optional) save the composed space
#io_utils.save(composed_space, "./comp_space_toy.pkl")

###########################################################
#IT'S YOUR (first) MOMENT!
#let's see how weighting the constituents differently affects the composed vectors
# 1. set the weights to (alpha = 1, beta = 1)
# 2. build a new space called 'composed_space2' containing a representation for 'bull_dog'
# 3. get the top 10 neighbours of the composed 'bull_dog' in predict_space
# 4. try the same pipeline with (alpha = 0.2, beta = 0.8), or whatever weights you like
# 5. what can you observe?

#you might be asking yourself whether there's any way to 'learn'
#the weights to be applied in order to get a composed vector
#that approximates the original one... the answer is YES!
# --> see the WEIGHTED ADDITIVE and FULL ADDITIVE MODELS below!

###########################################################
#APPLY A MULTIPLICATIVE MODEL
###########################################################
#remember: this model cannot be trained (yet)
from composes.composition.multiplicative import Multiplicative

my_comp = Multiplicative()
composed_space_mult = my_comp.compose([("sword", "fish", "sword_fish"),
                                       ("bull", "dog", "bull_dog")], predict_space)
print composed_space_mult.get_neighbours("bull_dog", 10, CosSimilarity(), space2 = predict_space)
print composed_space_mult.get_neighbours("sword_fish", 10, CosSimilarity(), space2 = predict_space)
#QUESTION: How does it work?
#And how does it compare with the simple additive model?
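#(aside) a toy numpy sketch of the intuition, with made-up vectors (not the
#real spaces): the multiplicative model composes by component-wise product,
#so a dimension survives only if BOTH words have mass on it (a sort of
#feature intersection), while addition keeps the union of the words' features
import numpy as np #already imported if you ran the earlier aside
u = np.array([0.9, 0.5, 0.0]) #hypothetical vector for 'sword'
v = np.array([0.8, 0.0, 0.7]) #hypothetical vector for 'fish'
print np.multiply(u, v) #multiplicative: [0.72 0. 0.] -- shared dimensions only
print u + v #simple additive: [1.7 0.5 0.7] -- union of dimensions
#let's now compare the two models on the real vectors: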
my_comp = WeightedAdditive(alpha = 1, beta = 1)
composed_space_add = my_comp.compose([("sword", "fish", "sword_fish"),
                                      ("bull", "dog", "bull_dog")], predict_space)
print composed_space_add.get_neighbours("bull_dog", 10, CosSimilarity(), space2 = predict_space)
print composed_space_add.get_neighbours("sword_fish", 10, CosSimilarity(), space2 = predict_space)

###########################################################
#TRAIN AND APPLY A WEIGHTED ADDITIVE MODEL
###########################################################
#all composition models that have parameters (i.e. all available composition
#models except Multiplicative) can be trained using examples of argument words
#and the corresponding output phrase vectors.
#all models are trained by minimizing the Euclidean norm of the difference
#between the composed phrase vectors generated by the models
#and the corresponding phrase vectors passed as training data

from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#training data
train_data = [("good", "car", "good_car"),
              ("good", "book", "good_book")]

#load an argument space
arg_space = io_utils.load("../../dissect/src/examples/data/out/ex10.pkl")
print arg_space.id2row
print arg_space.cooccurrence_matrix

#load a phrase space
phrase_space = io_utils.load("../../dissect/src/examples/data/out/PHRASE_SS.ex10.pkl")
print phrase_space.id2row
print phrase_space.cooccurrence_matrix

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta

###########let's try with realistic data...
#training data
train_data = [("sword", "fish", "swordfish"),
              ("bull", "dog", "bulldog")]

#load the source space
arg_space = io_utils.load("./vectors.pkl")

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, arg_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta
#we see that the 1st constituent (i.e. the modifier) is far less important
#than the 2nd constituent (i.e. the head) in these two examples!

##############testing phase
#once trained, the model and its parameters can be saved
#and used later on test data

#(optional) save it to pickle
#io_utils.save(my_comp, "../../dissect/src/examples/data/out/model_bulldog.pkl")
#print its parameters to a file
#my_comp.export("../../dissect/src/examples/data/out/model_bulldog.params")
#my_comp = io_utils.load("../../dissect/src/examples/data/out/model_bulldog.pkl")

#test data
test_data = [("air", "plane", "air_plane"),
             ("sun", "light", "sun_light")]
composed_space_test = my_comp.compose(test_data, arg_space)
print composed_space_test.get_neighbours("air_plane", 10, CosSimilarity(), space2 = arg_space)
print composed_space_test.get_neighbours("sun_light", 10, CosSimilarity(), space2 = arg_space)
#seems to be working, right? :)
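#(aside) what did train() actually do above? It found the alpha and beta that
#minimize ||alpha*u + beta*v - p|| over the training triples. A minimal
#least-squares sketch on made-up toy vectors (hypothetical data, just to make
#the math concrete):
import numpy as np
u = np.array([1.0, 0.0, 2.0]) #toy vector for the 1st constituent
v = np.array([0.0, 1.0, 1.0]) #toy vector for the 2nd constituent
p = 0.3 * u + 0.7 * v #toy 'observed' phrase vector
X = np.column_stack([u, v]) #one column per constituent
coeffs = np.linalg.lstsq(X, p)[0]
print "alpha:", coeffs[0], "beta:", coeffs[1] #recovers 0.3 and 0.7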
###########################################################
#TRAIN AND APPLY A FULL ADDITIVE MODEL (cf. Guevara 2010)
###########################################################
#this model learns two matrices of weights to be applied
#to the first and to the second word, respectively.
#the matrices are learnt via Partial Least Squares Regression
#or Ridge Regression, techniques aimed at minimizing
#the Euclidean norm of the difference between the composed vectors
#and the corresponding phrase vectors passed as training data

from composes.composition.full_additive import FullAdditive

my_comp = FullAdditive()

#training data: noun-noun compounds
train_data2 = [("sword", "fish", "swordfish"),
               ("bull", "dog", "bulldog"),
               ("basket", "ball", "basketball"),
               ("boat", "house", "boathouse"),
               ("milk", "shake", "milkshake"),
               ("tennis", "table", "tennistable"),
               ("training", "shoes", "trainingshoes"),
               ("hand", "book", "handbook"),
               ("mouse", "pad", "mousepad"),
               ("pencil", "case", "pencilcase"),
               ("summer", "time", "summertime"),
               ("dog", "house", "doghouse"),
               ("week", "end", "weekend"),
               ("snow", "ball", "snowball"),
               ("water", "fall", "waterfall")]

#before training, let's check that the training words are actually in the space
words_observed = arg_space.get_row2id()
for a, b, c in train_data2:
    if a not in words_observed:
        print "not found:", a
        continue
    if b not in words_observed:
        print "not found:", b
        continue
    if c not in words_observed:
        print "not found:", c

#important! We might have the constituent vectors in one space
#and the 'phrase' vectors in a different, yet compatible space
my_comp.train(train_data2, arg_space, arg_space)

#the two matrices are stored transposed internally, hence the .transpose()
print "\nA:", my_comp._mat_a_t.transpose()
print "B:", my_comp._mat_b_t.transpose()
#what is the shape of these matrices?
print my_comp._mat_a_t.transpose().shape

composed_space_fadd = my_comp.compose(train_data2, arg_space)
print "Composed_fadd space:"
print composed_space_fadd.id2row
print composed_space_fadd.cooccurrence_matrix

#test data (note that the composed labels, e.g. 'dog_house', are new strings)
test_data2 = [("tennis", "table", "tennis_table"),
              ("dog", "house", "dog_house"),
              ("training", "shoes", "training_shoes"),
              ("pencil", "case", "pencil_case"),
              ("volley", "ball", "volley_ball"),
              ("wind", "mill", "wind_mill")]

composed_space_fadd_test = my_comp.compose(test_data2, arg_space)
io_utils.save(composed_space_fadd_test, "./composed_space_compounds.pkl")

print composed_space_fadd_test.get_neighbours("tennis_table", 10, CosSimilarity(), space2 = arg_space)
print composed_space_fadd_test.get_neighbours("dog_house", 10, CosSimilarity(), space2 = arg_space)
print composed_space_fadd_test.get_neighbours("training_shoes", 10, CosSimilarity(), space2 = arg_space)
#these are compound words that are not in the original space!
#this means that we can generate representations for whatever pair of words!
#let's check some existing lexical entries...

###########################################################
#IT'S YOUR (second) MOMENT!
# 1. get the top 10 neighbours of the composed 'volley_ball' and 'wind_mill' in arg_space
# 2. report what you notice
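#(aside) conceptually, FullAdditive composes as p = A*u + B*v: instead of two
#scalar weights, each constituent gets transformed by a full matrix before the
#sum. A toy numpy sketch with made-up diagonal 3x3 matrices (the real A and B
#learned above are full d-by-d matrices, not diagonal toys):
import numpy as np
A = np.eye(3) * 0.4 #toy matrix applied to the modifier
B = np.eye(3) * 0.9 #toy matrix applied to the head
u = np.array([1.0, 0.0, 1.0]) #toy 'dog' vector
v = np.array([0.0, 1.0, 1.0]) #toy 'house' vector
print A.dot(u) + B.dot(v) #toy composed 'dog_house': [0.4 0.9 1.3]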
###########################################################
#TRAIN AND APPLY A LEXICAL FUNCTION MODEL
###########################################################
#the Lexical Function composition model differs from the other models
#in that its parameters are weight tensors, one for each functor word
#being trained. These weight tensors are stored as vectors
#in a semantic space. Thus, for DISSECT, the 'parameter' of a
#Lexical Function model is itself a semantic space

#based on exercise 3 here:
#http://clic.cimec.unitn.it/composes/toolkit/exercises.html

#TRAIN AND APPLY A COMPOSITION MODEL
# 1. Load training data from file DATA_PATH/training_pairs.txt
# 2. Build the core space (argument space) and a peripheral space containing only n-v phrases
# 3. Train a lexical function model on the two spaces using Ridge Regression with lambda=2
# 4. Load testing pairs from DATA_PATH/testing_pairs.txt (list of elements to be composed)
# 5. Apply the trained lexical function model on these pairs and save the resulting phrase space
# 6. Print the top 10 neighbors of "conflict-n_erupt-v" in the composed phrase space
# 7. Print the top 10 neighbors of "conflict-n_erupt-v" in the argument space

quit()

cd ../dissect/src/examples/data/in/demo/

python2.7

from composes.utils import io_utils, log_utils
from composes.semantic_space.space import Space
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.composition.lexical_function import LexicalFunction
from composes.utils import regression_learner
from composes.similarity.cos import CosSimilarity

core_cooccurrence_file = "core.sm"
core_row_file = "core.rows"
core_col_file = "core.cols"

print "Building semantic space from co-occurrence counts"
core_space = Space.build(data = core_cooccurrence_file,
                         rows = core_row_file,
                         cols = core_col_file,
                         format = "sm")

print "Applying ppmi weighting"
core_space = core_space.apply(PpmiWeighting())

print "Applying svd 500"
core_space = core_space.apply(Svd(500))

#build the peripheral space containing the subject-verb phrases
per_cooccurrence_file = "sv.sm"
per_row_file = "sv.rows"
per_col_file = "sv.cols"
per_space = PeripheralSpace.build(core_space,
                                  data = per_cooccurrence_file,
                                  rows = per_row_file,
                                  cols = per_col_file,
                                  format = "sm")

training_pair_file = "training_pairs.txt" #file containing tuples
testing_pair_file = "testing_pairs.txt" #file containing tuples
composed_space_file = "composed.pkl"

print "Reading in train data"
train_data = io_utils.read_tuple_list(training_pair_file, fields = [0, 1, 2])

print "Training Lexical Function compositional model"
comp_model = LexicalFunction(learner = regression_learner.RidgeRegressionLearner(param = 2))
comp_model.train(train_data, core_space, per_space)
print comp_model.function_space.id2row
#print comp_model.function_space.get_sim("submit-v", "submit-v", CosSimilarity())

print "Composing phrases"
test_phrases = io_utils.read_tuple_list(testing_pair_file, fields = [0, 1, 2])
composed_space = comp_model.compose(test_phrases, core_space)
print composed_space.id2row

print "Saving composed space"
io_utils.save(composed_space, composed_space_file)

print "Finding neighbors of \"conflict-n_erupt-v\" in the composed space"
neighbors = composed_space.get_neighbours("conflict-n_erupt-v", 10, CosSimilarity())
print neighbors

print "Finding neighbors of \"conflict-n_erupt-v\" in the core space"
neighbors = composed_space.get_neighbours("conflict-n_erupt-v", 10, CosSimilarity(), core_space)
print neighbors
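#(aside) the intuition behind the Lexical Function model: each functor word
#(here, the verb) IS a function, concretely a matrix, and composition is just
#matrix-by-vector multiplication: phrase = U_verb * v_noun. A toy numpy sketch
#with a made-up 3-dimensional space (not the spaces used above):
import numpy as np
U_erupt = np.array([[0.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0],
                    [0.0, 0.0, 2.0]]) #hypothetical learned matrix for 'erupt-v'
v_conflict = np.array([0.5, 1.0, 0.2]) #hypothetical vector for 'conflict-n'
print U_erupt.dot(v_conflict) #a toy composed 'conflict-n_erupt-v'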
###########################################################
#IT'S YOUR (third) MOMENT!
# 1. launch python2.7 and load these packages:
from composes.utils import io_utils, log_utils
from composes.semantic_space.space import Space
from composes.composition.full_additive import FullAdditive
from composes.similarity.cos import CosSimilarity
# 2. load the usual "vectors.pkl" as arg_space and "composed_space_compounds.pkl" as comp_space
# 3. train a FULLADD model using the compounds in comp_space and the nouns from arg_space as training data
# 4. test the trained model on new (even unattested) combinations: "bike_girl", "thursday_class", etc.
# 5. find the neighbours of the just-built representations in arg_space

###########################################################
#WRAP-UP SUMMARY
#Today we learnt to:
# 1. create and apply a simple ADDITIVE model
# 2. build a so-called PERIPHERAL space
# 3. compute similarities and neighbours across different spaces
# 4. train and apply a WEIGHTED ADDITIVE model
# 5. apply a MULTIPLICATIVE model
# 6. train and apply a FULL ADDITIVE model
# 7. train and apply a LEXICAL FUNCTION model
# 8. more generally: train, apply, and save compositional models
###########################################################