###########################################################
#Computational Linguistics 18-19
#DISSECT tutorial
#How to: PLAY WITH DISSECT - COMPOSITIONAL DSMs
#sandro.pezzelle@unitn.it
###########################################################
#demos and tutorials can be found here:
#http://clic.cimec.unitn.it/composes/toolkit/
#here we will use some of the tutorials at this page:
#http://clic.cimec.unitn.it/composes/toolkit/composing.html
#more info on compositional models in DISSECT:
#http://clic.cimec.unitn.it/composes/toolkit/compose.html
###########################################################
# CHECK-POINT QUESTIONS:
# 1. Why COSINE for measuring similarity?
# 2. What's the intuition behind applying SVD to vectors?
# 3. Why do we need COMPOSITIONAL MODELS?
###########################################################
#A BRIEF RECAP: LOAD SPACE, COMPUTE COSINE SIM, NEIGHBORS
###########################################################
#You can use pre-trained, existing embeddings like word2vec, GloVe, etc.
#some repositories are listed below:
#https://github.com/3Top/word2vec-api
#https://github.com/Hironsan/awesome-embedding-models
#I suggest using gensim for building spaces from raw text:
#https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec
#you should already have it installed...
#to simplify things, I provided you with a semantic space
#built using state-of-the-art word2vec. You downloaded the file in .bin,
#which is the default format you get out of word2vec and similar tools.
#we will now convert it into .pkl using a script that does the job,
#then we'll play with the DISSECT toolkit on it. READY?

cd ~
cd dissect-env/class/class-stuff/

#if you didn't do it already...
#python2.7 bin-to-pkl.py
#...to get the .pkl version of the space

python2.7

from composes.semantic_space.space import Space
from composes.similarity.cos import CosSimilarity
from composes.utils import io_utils
from composes.utils import log_utils

count_space = io_utils.load("./text8.pkl")
predict_space = io_utils.load("./vectors.pkl")

#we can compute pairwise similarities (and look at the differences!)
#let's try with concrete words:
print count_space.get_sim("school", "book", CosSimilarity())
print predict_space.get_sim("school", "book", CosSimilarity())
print count_space.get_sim("car", "book", CosSimilarity())
print predict_space.get_sim("car", "book", CosSimilarity())
#and now with abstract ones:
print count_space.get_sim("law", "politics", CosSimilarity())
print predict_space.get_sim("law", "politics", CosSimilarity())
#remember that the 'count' space we're using is the raw co-occurrence one! (1000 dimensions)

#we can also compute the neighbours...
print count_space.get_neighbours("school", 10, CosSimilarity())
print predict_space.get_neighbours("school", 10, CosSimilarity())
#do not quit python!
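#(aside, re: check-point question 1) cosine compares the DIRECTIONS of two
#vectors and ignores their lengths, so a frequent word and a rare word can
#still be close if they occur in the same contexts. A minimal numpy sketch of
#what CosSimilarity computes, on made-up toy vectors (not the real spaces):
import numpy as np
v1 = np.array([1.0, 2.0, 0.0])
v2 = np.array([2.0, 4.0, 0.0]) #same direction, twice the length
print np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) #prints 1.0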
###########################################################
#APPLY A SIMPLE (BUT USUALLY VERY EFFECTIVE) ADDITIVE MODEL
###########################################################
#python2.7
#from composes.utils import io_utils
#from composes.similarity.cos import CosSimilarity
from composes.composition.weighted_additive import WeightedAdditive

#we load our word2vec space
#predict_space = io_utils.load("./vectors.pkl")

#instantiate a weighted additive model, in this case a simple additive one
#how do we make it simple additive? By setting both weights to 1!
my_comp = WeightedAdditive(alpha = 1, beta = 1)

#use the model to compose words in predict_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car"),
                                  ("red", "shirt", "red_shirt"),
                                  ("yellow", "shirt", "yellow_shirt"),
                                  ("we", "need", "we_need")], predict_space)
#we are composing words contained in the same space!

print composed_space.id2row
print composed_space.cooccurrence_matrix
#these are our composed (summed) vectors

print composed_space.get_sim("red_shirt", "yellow_shirt", CosSimilarity())
print composed_space.get_sim("good_book", "good_car", CosSimilarity())
#these are the similarities between the composed phrases
#QUESTION: What do we notice wrt the similarities?

#we can also compute the similarity between a word in one space
#and a word/phrase in a different space
print predict_space.get_sim("car", "good_car", CosSimilarity(), space2 = composed_space)
print predict_space.get_sim("shirt", "yellow_shirt", CosSimilarity(), space2 = composed_space)
#QUESTION: What about the similarity between the AN phrase and the A?
print predict_space.get_sim("good", "good_car", CosSimilarity(), space2 = composed_space)
print predict_space.get_sim("yellow", "yellow_shirt", CosSimilarity(), space2 = composed_space)

#finally, we can also find the nearest neighbours in one space
#given a vector in a different space
print predict_space.get_neighbours("car", 4, CosSimilarity(), space2 = composed_space)
print composed_space.get_neighbours("red_shirt", 10, CosSimilarity(), space2 = predict_space)

#we can also do it recursively, obtaining phrases and sentences
#(better, and easier, to do it with a script!)
comp_space2 = my_comp.compose([("we_need", "good_book", "we_need_good_book"),
                               ("we_need", "good_car", "we_need_good_car")], composed_space)
print comp_space2.get_neighbours("we_need_good_car", 10, CosSimilarity(), space2 = predict_space)

#(optional) save the composed space
#io_utils.save(composed_space, "./comp_space_toy.pkl")

###########################################################
#IT'S YOUR (first) MOMENT!
#let's see how weighting the constituents differently affects the composed vectors
# 1. set the weights to (alpha = 1, beta = 1)
# 2. build a new space called 'composed_space2' containing a representation for 'bull_dog'
# 3. get the top 10 neighbours of the composed 'bull_dog' in predict_space
# 4. try the same pipeline with (alpha = 0.2, beta = 0.8), or whatever weights you like
# 5. what can you observe?

#you might be asking yourself whether there's any way to 'learn'
#the weights to be applied in order to get a composed vector
#that approximates the original one... the answer is YES!
# --> see the WEIGHTED ADDITIVE and FULL ADDITIVE MODELS below!

###########################################################
#APPLY A MULTIPLICATIVE MODEL
###########################################################
#remember: this model cannot be trained (yet)
from composes.composition.multiplicative import Multiplicative

my_comp = Multiplicative()
composed_space_mult = my_comp.compose([("sword", "fish", "sword_fish"),
                                       ("bull", "dog", "bull_dog")], predict_space)
print composed_space_mult.get_neighbours("bull_dog", 10, CosSimilarity(), space2 = predict_space)
print composed_space_mult.get_neighbours("sword_fish", 10, CosSimilarity(), space2 = predict_space)
#QUESTION: How does it work?
#And how does it compare with the simple additive model?
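#(aside) a toy numpy sketch of the intuition, with made-up vectors (not the
#real spaces): the multiplicative model composes by component-wise product,
#so a dimension survives only if BOTH words have mass on it (a sort of
#feature intersection), while addition keeps the union of the words' features
import numpy as np #already imported if you ran the earlier aside
u = np.array([0.9, 0.5, 0.0]) #hypothetical vector for 'sword'
v = np.array([0.8, 0.0, 0.7]) #hypothetical vector for 'fish'
print np.multiply(u, v) #multiplicative: [0.72 0. 0.] -- shared dimensions only
print u + v #simple additive: [1.7 0.5 0.7] -- union of dimensions
#let's now compare the two models on the real vectors: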
my_comp = WeightedAdditive(alpha = 1, beta = 1)
composed_space_add = my_comp.compose([("sword", "fish", "sword_fish"),
                                      ("bull", "dog", "bull_dog")], predict_space)
print composed_space_add.get_neighbours("bull_dog", 10, CosSimilarity(), space2 = predict_space)
print composed_space_add.get_neighbours("sword_fish", 10, CosSimilarity(), space2 = predict_space)

###########################################################
#TRAIN AND APPLY A WEIGHTED ADDITIVE MODEL
###########################################################
#all composition models that have parameters (i.e. all available composition
#models except Multiplicative) can be trained using examples of argument words
#and the corresponding output phrase vectors.
#all models are trained by minimizing the Euclidean norm of the difference
#between the composed phrase vectors generated by the models
#and the corresponding phrase vectors passed as training data

from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#training data
train_data = [("good", "car", "good_car"),
              ("good", "book", "good_book")]

#load an argument space
arg_space = io_utils.load("../../dissect/src/examples/data/out/ex10.pkl")
print arg_space.id2row
print arg_space.cooccurrence_matrix

#load a phrase space
phrase_space = io_utils.load("../../dissect/src/examples/data/out/PHRASE_SS.ex10.pkl")
print phrase_space.id2row
print phrase_space.cooccurrence_matrix

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta

###########let's try with realistic data...
#training data
train_data = [("sword", "fish", "swordfish"),
              ("bull", "dog", "bulldog")]

#load the source space
arg_space = io_utils.load("./vectors.pkl")

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, arg_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta
#we see that the 1st constituent (i.e. the modifier) is far less important
#than the 2nd constituent (i.e. the head) in these two examples!

##############testing phase
#once trained, the model and its parameters can be saved
#and used later on test data

#(optional) save it to pickle
#io_utils.save(my_comp, "../../dissect/src/examples/data/out/model_bulldog.pkl")
#print its parameters to a file
#my_comp.export("../../dissect/src/examples/data/out/model_bulldog.params")
#my_comp = io_utils.load("../../dissect/src/examples/data/out/model_bulldog.pkl")

#test data
test_data = [("air", "plane", "air_plane"),
             ("sun", "light", "sun_light")]
composed_space_test = my_comp.compose(test_data, arg_space)
print composed_space_test.get_neighbours("air_plane", 10, CosSimilarity(), space2 = arg_space)
print composed_space_test.get_neighbours("sun_light", 10, CosSimilarity(), space2 = arg_space)
#seems to be working, right? :)
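#(aside) what did train() actually do above? It found the alpha and beta that
#minimize ||alpha*u + beta*v - p|| over the training triples. A minimal
#least-squares sketch on made-up toy vectors (hypothetical data, just to make
#the math concrete):
import numpy as np
u = np.array([1.0, 0.0, 2.0]) #toy vector for the 1st constituent
v = np.array([0.0, 1.0, 1.0]) #toy vector for the 2nd constituent
p = 0.3 * u + 0.7 * v #toy 'observed' phrase vector
X = np.column_stack([u, v]) #one column per constituent
coeffs = np.linalg.lstsq(X, p)[0]
print "alpha:", coeffs[0], "beta:", coeffs[1] #recovers 0.3 and 0.7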
###########################################################
#TRAIN AND APPLY A FULL ADDITIVE MODEL (cf. Guevara 2010)
###########################################################
#this model learns two matrices of weights to be applied
#to the first and to the second word, respectively.
#the matrices are learnt via Partial Least Squares Regression
#or Ridge Regression, techniques aimed at minimizing
#the Euclidean norm of the difference between the composed vectors
#and the corresponding phrase vectors passed as training data

from composes.composition.full_additive import FullAdditive

my_comp = FullAdditive()

#training data: noun-noun compounds
train_data2 = [("sword", "fish", "swordfish"),
               ("bull", "dog", "bulldog"),
               ("basket", "ball", "basketball"),
               ("boat", "house", "boathouse"),
               ("milk", "shake", "milkshake"),
               ("tennis", "table", "tennistable"),
               ("training", "shoes", "trainingshoes"),
               ("hand", "book", "handbook"),
               ("mouse", "pad", "mousepad"),
               ("pencil", "case", "pencilcase"),
               ("summer", "time", "summertime"),
               ("dog", "house", "doghouse"),
               ("week", "end", "weekend"),
               ("snow", "ball", "snowball"),
               ("water", "fall", "waterfall")]

#before training, let's check that the training words are actually in the space
words_observed = arg_space.get_row2id()
for a, b, c in train_data2:
    if a not in words_observed:
        print "not found:", a
        continue
    if b not in words_observed:
        print "not found:", b
        continue
    if c not in words_observed:
        print "not found:", c

#important! We might have the constituent vectors in one space
#and the 'phrase' vectors in a different, yet compatible space
my_comp.train(train_data2, arg_space, arg_space)

#the two matrices are stored transposed internally, hence the .transpose()
print "\nA:", my_comp._mat_a_t.transpose()
print "B:", my_comp._mat_b_t.transpose()
#what is the shape of these matrices?
print my_comp._mat_a_t.transpose().shape

composed_space_fadd = my_comp.compose(train_data2, arg_space)
print "Composed_fadd space:"
print composed_space_fadd.id2row
print composed_space_fadd.cooccurrence_matrix

#test data (note that the composed labels, e.g. 'dog_house', are new strings)
test_data2 = [("tennis", "table", "tennis_table"),
              ("dog", "house", "dog_house"),
              ("training", "shoes", "training_shoes"),
              ("pencil", "case", "pencil_case"),
              ("volley", "ball", "volley_ball"),
              ("wind", "mill", "wind_mill")]

composed_space_fadd_test = my_comp.compose(test_data2, arg_space)
io_utils.save(composed_space_fadd_test, "./composed_space_compounds.pkl")

print composed_space_fadd_test.get_neighbours("tennis_table", 10, CosSimilarity(), space2 = arg_space)
print composed_space_fadd_test.get_neighbours("dog_house", 10, CosSimilarity(), space2 = arg_space)
print composed_space_fadd_test.get_neighbours("training_shoes", 10, CosSimilarity(), space2 = arg_space)
#these are compound words that are not in the original space!
#this means that we can generate representations for whatever pair of words!
#let's check some existing lexical entries...

###########################################################
#IT'S YOUR (second) MOMENT!
# 1. get the top 10 neighbours of the composed 'volley_ball' and 'wind_mill' in arg_space
# 2. report what you notice
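#(aside) conceptually, FullAdditive composes as p = A*u + B*v: instead of two
#scalar weights, each constituent gets transformed by a full matrix before the
#sum. A toy numpy sketch with made-up diagonal 3x3 matrices (the real A and B
#learned above are full d-by-d matrices, not diagonal toys):
import numpy as np
A = np.eye(3) * 0.4 #toy matrix applied to the modifier
B = np.eye(3) * 0.9 #toy matrix applied to the head
u = np.array([1.0, 0.0, 1.0]) #toy 'dog' vector
v = np.array([0.0, 1.0, 1.0]) #toy 'house' vector
print A.dot(u) + B.dot(v) #toy composed 'dog_house': [0.4 0.9 1.3]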
###########################################################
#TRAIN AND APPLY A LEXICAL FUNCTION MODEL
###########################################################
#the Lexical Function composition model differs from the other models
#in that its parameters are weight tensors, one for each functor word
#being trained. These weight tensors are stored as vectors
#in a semantic space. Thus, for DISSECT, the 'parameter' of a
#Lexical Function model is itself a semantic space

#based on exercise 3 here:
#http://clic.cimec.unitn.it/composes/toolkit/exercises.html

#TRAIN AND APPLY A COMPOSITION MODEL
# 1. Load training data from file DATA_PATH/training_pairs.txt
# 2. Build the core space (argument space) and a peripheral space containing only n-v phrases
# 3. Train a lexical function model on the two spaces using Ridge Regression with lambda=2
# 4. Load testing pairs from DATA_PATH/testing_pairs.txt (list of elements to be composed)
# 5. Apply the trained lexical function model on these pairs and save the resulting phrase space
# 6. Print the top 10 neighbors of "conflict-n_erupt-v" in the composed phrase space
# 7. Print the top 10 neighbors of "conflict-n_erupt-v" in the argument space

quit()

cd ../dissect/src/examples/data/in/demo/

python2.7

from composes.utils import io_utils, log_utils
from composes.semantic_space.space import Space
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.composition.lexical_function import LexicalFunction
from composes.utils import regression_learner
from composes.similarity.cos import CosSimilarity

core_cooccurrence_file = "core.sm"
core_row_file = "core.rows"
core_col_file = "core.cols"

print "Building semantic space from co-occurrence counts"
core_space = Space.build(data = core_cooccurrence_file,
                         rows = core_row_file,
                         cols = core_col_file,
                         format = "sm")

print "Applying ppmi weighting"
core_space = core_space.apply(PpmiWeighting())

print "Applying svd 500"
core_space = core_space.apply(Svd(500))

#build the peripheral space containing the subject-verb phrases
per_cooccurrence_file = "sv.sm"
per_row_file = "sv.rows"
per_col_file = "sv.cols"
per_space = PeripheralSpace.build(core_space,
                                  data = per_cooccurrence_file,
                                  rows = per_row_file,
                                  cols = per_col_file,
                                  format = "sm")

training_pair_file = "training_pairs.txt" #file containing tuples
testing_pair_file = "testing_pairs.txt" #file containing tuples
composed_space_file = "composed.pkl"

print "Reading in train data"
train_data = io_utils.read_tuple_list(training_pair_file, fields = [0, 1, 2])

print "Training Lexical Function compositional model"
comp_model = LexicalFunction(learner = regression_learner.RidgeRegressionLearner(param = 2))
comp_model.train(train_data, core_space, per_space)
print comp_model.function_space.id2row
#print comp_model.function_space.get_sim("submit-v", "submit-v", CosSimilarity())

print "Composing phrases"
test_phrases = io_utils.read_tuple_list(testing_pair_file, fields = [0, 1, 2])
composed_space = comp_model.compose(test_phrases, core_space)
print composed_space.id2row

print "Saving composed space"
io_utils.save(composed_space, composed_space_file)

print "Finding neighbors of \"conflict-n_erupt-v\" in the composed space"
neighbors = composed_space.get_neighbours("conflict-n_erupt-v", 10, CosSimilarity())
print neighbors

print "Finding neighbors of \"conflict-n_erupt-v\" in the core space"
neighbors = composed_space.get_neighbours("conflict-n_erupt-v", 10, CosSimilarity(), core_space)
print neighbors
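#(aside) the intuition behind the Lexical Function model: each functor word
#(here, the verb) IS a function, concretely a matrix, and composition is just
#matrix-by-vector multiplication: phrase = U_verb * v_noun. A toy numpy sketch
#with a made-up 3-dimensional space (not the spaces used above):
import numpy as np
U_erupt = np.array([[0.0, 1.0, 0.0],
                    [1.0, 0.0, 0.0],
                    [0.0, 0.0, 2.0]]) #hypothetical learned matrix for 'erupt-v'
v_conflict = np.array([0.5, 1.0, 0.2]) #hypothetical vector for 'conflict-n'
print U_erupt.dot(v_conflict) #a toy composed 'conflict-n_erupt-v'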
###########################################################
#IT'S YOUR (third) MOMENT!
# 1. launch python2.7 and load these packages:
from composes.utils import io_utils, log_utils
from composes.semantic_space.space import Space
from composes.composition.full_additive import FullAdditive
from composes.similarity.cos import CosSimilarity
# 2. load the usual "vectors.pkl" as arg_space and "composed_space_compounds.pkl" as comp_space
# 3. train a FULLADD model using the compounds in comp_space and the nouns from arg_space as training data
# 4. test the trained model on new (even unattested) combinations: "bike_girl", "thursday_class", etc.
# 5. find the neighbours of the just-built representations in arg_space

###########################################################
#WRAP-UP SUMMARY
#Today we learnt to:
# 1. create and apply a simple ADDITIVE model
# 2. build a so-called PERIPHERAL space
# 3. compute similarities and neighbours across different spaces
# 4. train and apply a WEIGHTED ADDITIVE model
# 5. apply a MULTIPLICATIVE model
# 6. train and apply a FULL ADDITIVE model
# 7. train and apply a LEXICAL FUNCTION model
# 8. more generally: train, apply, and save compositional models
###########################################################