NLP: Paraphrase Detection

In [91]:
import numpy as np

Load data

In [92]:
Trfile = open('train.txt', 'r')   # training sentence pairs (tab-separated)
Tefile = open('Test.txt', 'r')    # test sentence pairs (tab-separated)

Read the Data

In [93]:
Datafile = Trfile.read()
Testfile = Tefile.read()
In [94]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Data preprocessing

In [95]:
def get_data(Datafile, PLAY):
    # Split the raw file into lines, then each line into tab-separated fields.
    TList = []
    Datapre = Datafile.split('\n')
    for ele in Datapre:
        Textlist = ele.split('\t')
        TList.append(Textlist)

    # The last element is typically empty (trailing newline), so loops stop at len-1.
    if (PLAY == "TRAIN"):
        # Training lines: Quality (label), ID1, ID2, String1, String2
        Quality = []
        ID1 = []
        ID2 = []
        String1 = []
        String2 = []
        for i in range(0, len(TList) - 1):
            Quality.append(TList[i][0])
            ID1.append(TList[i][1])
            ID2.append(TList[i][2])
            String1.append(TList[i][3])
            String2.append(TList[i][4])
        return Quality, ID1, ID2, String1, String2

    if (PLAY == "TEST"):
        # Test lines: ID1, ID2, String1, String2 (no quality label)
        ID1 = []
        ID2 = []
        String1 = []
        String2 = []
        for i in range(0, len(TList) - 1):
            ID1.append(TList[i][0])
            ID2.append(TList[i][1])
            String1.append(TList[i][2])
            String2.append(TList[i][3])
        return ID1, ID2, String1, String2

Extract the details

In [96]:
Train_Quality, Train_ID1,Train_ID2,Train_String1,Train_String2 = get_data(Datafile,PLAY = "TRAIN")
Test_ID1,Test_ID2,Test_String1,Test_String2 = get_data(Testfile,PLAY = "TEST")
Trainlabel = Train_Quality

Feature Learning Part

Since this is a paraphrase-detection task, we compute similarity measures between the vector representations of each sentence pair, such as the Euclidean distance, cosine similarity, and Manhattan distance, and use these as features.
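As a toy illustration of the three measures on two small count vectors (not part of the pipeline, just to fix intuition):

# Toy illustration of the three pairwise measures used as features below.
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances

a = np.array([[1, 0, 2, 1]])   # bag-of-words counts for sentence 1
b = np.array([[1, 1, 1, 0]])   # bag-of-words counts for sentence 2

print distance.euclidean(a[0], b[0])      # Euclidean (L2) distance, sqrt(3) here
print cosine_similarity(a, b)[0, 0]       # cosine similarity, ~0.707 here
print manhattan_distances(a, b)[0, 0]     # Manhattan (L1) distance, 3.0 here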

In [97]:
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity,manhattan_distances
In [98]:
def Featurelearning(string1, string2):
    # For each sentence pair (row i of both matrices), compute three pairwise
    # measures and stack them into an (n_pairs x 3) feature matrix.
    Feature1 = []
    Feature2 = []
    Feature3 = []
    [r1, c1] = string1.shape
    TR1 = string1.todense()
    TR2 = string2.todense()
    for i in range(0, r1):
        Feature1.append(distance.euclidean(TR1[i, :], TR2[i, :]))
        Feature2.append(cosine_similarity(TR1[i, :], TR2[i, :]))
        Feature3.append(manhattan_distances(TR1[i, :], TR2[i, :]))
    Feature1 = (np.array(Feature1)).reshape(r1, 1)
    Feature2 = (np.array(Feature2)).reshape(r1, 1)
    Feature3 = (np.array(Feature3)).reshape(r1, 1)
    Feature = np.append(Feature1, Feature2, axis=1)
    Feature_Mat = np.append(Feature, Feature3, axis=1)
    return Feature_Mat

To choose the best parameters of the term-document matrix (min_df and the n-gram range), we run a grid search and score each setting with repeated random train/test splits.
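The search below is written out as explicit nested loops. As a compact alternative (a sketch only, not what this notebook runs), the same sweep could be expressed with scikit-learn's ParameterGrid:

# Sketch: the same min_df / n-gram sweep expressed with ParameterGrid.
from sklearn.model_selection import ParameterGrid

param_grid = {'min_df': [1, 2, 3, 4], 'ngram': [1]}   # the values swept below
for params in ParameterGrid(param_grid):
    print params
    # For each setting one would build
    # CountVectorizer(ngram_range=(1, params['ngram']), min_df=params['min_df']),
    # recompute the distance features and score them as in CROSSVALIDATION_REP.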

In [99]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.svm import SVC
In [100]:
def CROSSVALIDATION_REP(String1, String2, Trainlabel, VOC, min_dfrange, gram, randomstatemax, METHOD):
    # Grid search over min_df and the n-gram range of the vectorizer.
    # For every setting, the distance features are rebuilt and a linear SVM is
    # scored on `randomstatemax` random 90/10 train/test splits.

    if (METHOD == "TDM"):
        Value = 0
        AvgAcc = 0
        for j in range(1, min_dfrange):
            for ngram in range(1, gram):
                # Term-document (count) matrix with the current parameters
                voc = CountVectorizer(ngram_range=(1, ngram), max_df=1.0, min_df=j)
                voc.fit(VOC)
                TrainMat1 = voc.transform(String1)
                TrainMat2 = voc.transform(String2)
                TrainMat = Featurelearning(TrainMat1, TrainMat2)

                for i in range(0, randomstatemax):
                    X_train, X_test, y_train, y_test = train_test_split(TrainMat, Trainlabel, test_size=0.1, random_state=i)
                    clf = SVC(kernel='linear')
                    clf.fit(X_train, y_train)
                    Value = clf.score(X_test, y_test) * 100
                    AvgAcc = AvgAcc + Value
                    print '-----ngram_range-1- ', ngram, 'min_df', j, '-----Random_State:', i
                    print 'Accuracy in %', Value
                print '#############'
                print '--AVERAGE ACCURACY', AvgAcc / randomstatemax
                AvgAcc = 0

    if (METHOD == "TFIDFM"):
        Value = 0
        AvgAcc = 0
        for j in range(1, min_dfrange):
            for ngram in range(1, gram):
                # TF-IDF matrix with the current parameters
                voc = TfidfVectorizer(ngram_range=(1, ngram), max_df=1.0, min_df=j)
                voc.fit(VOC)
                TrainMat1 = voc.transform(String1)
                TrainMat2 = voc.transform(String2)
                TrainMat = Featurelearning(TrainMat1, TrainMat2)

                for i in range(0, randomstatemax):
                    X_train, X_test, y_train, y_test = train_test_split(TrainMat, Trainlabel, test_size=0.1, random_state=i)
                    clf = SVC(kernel='linear')
                    clf.fit(X_train, y_train)
                    Value = clf.score(X_test, y_test) * 100
                    AvgAcc = AvgAcc + Value
                    print '-----ngram_range-1- ', ngram, 'min_df', j, '-----Random_State:', i
                    print 'Accuracy in %', Value
                print '#############'
                print '--AVERAGE ACCURACY', AvgAcc / randomstatemax
                AvgAcc = 0
In [101]:
CROSSVALIDATION_REP(Train_String1,Train_String2,Trainlabel,Train_String1+Train_String2,min_dfrange=5,gram=2,randomstatemax=10,METHOD="TDM")
-----ngram_range-1-  1 min_df 1 -----Random_State: 0
Accuracy in % 71.8137254902
-----ngram_range-1-  1 min_df 1 -----Random_State: 1
Accuracy in % 72.0588235294
-----ngram_range-1-  1 min_df 1 -----Random_State: 2
Accuracy in % 73.0392156863
-----ngram_range-1-  1 min_df 1 -----Random_State: 3
Accuracy in % 73.5294117647
-----ngram_range-1-  1 min_df 1 -----Random_State: 4
Accuracy in % 71.3235294118
-----ngram_range-1-  1 min_df 1 -----Random_State: 5
Accuracy in % 73.0392156863
-----ngram_range-1-  1 min_df 1 -----Random_State: 6
Accuracy in % 71.8137254902
-----ngram_range-1-  1 min_df 1 -----Random_State: 7
Accuracy in % 72.3039215686
-----ngram_range-1-  1 min_df 1 -----Random_State: 8
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 1 -----Random_State: 9
Accuracy in % 72.7941176471
#############
--AVERAGE ACCURACY 72.1323529412
-----ngram_range-1-  1 min_df 2 -----Random_State: 0
Accuracy in % 71.3235294118
-----ngram_range-1-  1 min_df 2 -----Random_State: 1
Accuracy in % 70.0980392157
-----ngram_range-1-  1 min_df 2 -----Random_State: 2
Accuracy in % 72.7941176471
-----ngram_range-1-  1 min_df 2 -----Random_State: 3
Accuracy in % 72.7941176471
-----ngram_range-1-  1 min_df 2 -----Random_State: 4
Accuracy in % 69.362745098
-----ngram_range-1-  1 min_df 2 -----Random_State: 5
Accuracy in % 71.8137254902
-----ngram_range-1-  1 min_df 2 -----Random_State: 6
Accuracy in % 70.3431372549
-----ngram_range-1-  1 min_df 2 -----Random_State: 7
Accuracy in % 70.3431372549
-----ngram_range-1-  1 min_df 2 -----Random_State: 8
Accuracy in % 66.6666666667
-----ngram_range-1-  1 min_df 2 -----Random_State: 9
Accuracy in % 72.5490196078
#############
--AVERAGE ACCURACY 70.8088235294
-----ngram_range-1-  1 min_df 3 -----Random_State: 0
Accuracy in % 71.0784313725
-----ngram_range-1-  1 min_df 3 -----Random_State: 1
Accuracy in % 70.8333333333
-----ngram_range-1-  1 min_df 3 -----Random_State: 2
Accuracy in % 72.5490196078
-----ngram_range-1-  1 min_df 3 -----Random_State: 3
Accuracy in % 72.7941176471
-----ngram_range-1-  1 min_df 3 -----Random_State: 4
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 3 -----Random_State: 5
Accuracy in % 72.3039215686
-----ngram_range-1-  1 min_df 3 -----Random_State: 6
Accuracy in % 70.0980392157
-----ngram_range-1-  1 min_df 3 -----Random_State: 7
Accuracy in % 70.8333333333
-----ngram_range-1-  1 min_df 3 -----Random_State: 8
Accuracy in % 66.9117647059
-----ngram_range-1-  1 min_df 3 -----Random_State: 9
Accuracy in % 70.8333333333
#############
--AVERAGE ACCURACY 70.7843137255
-----ngram_range-1-  1 min_df 4 -----Random_State: 0
Accuracy in % 70.8333333333
-----ngram_range-1-  1 min_df 4 -----Random_State: 1
Accuracy in % 70.8333333333
-----ngram_range-1-  1 min_df 4 -----Random_State: 2
Accuracy in % 71.8137254902
-----ngram_range-1-  1 min_df 4 -----Random_State: 3
Accuracy in % 72.7941176471
-----ngram_range-1-  1 min_df 4 -----Random_State: 4
Accuracy in % 68.6274509804
-----ngram_range-1-  1 min_df 4 -----Random_State: 5
Accuracy in % 72.5490196078
-----ngram_range-1-  1 min_df 4 -----Random_State: 6
Accuracy in % 69.362745098
-----ngram_range-1-  1 min_df 4 -----Random_State: 7
Accuracy in % 71.0784313725
-----ngram_range-1-  1 min_df 4 -----Random_State: 8
Accuracy in % 67.6470588235
-----ngram_range-1-  1 min_df 4 -----Random_State: 9
Accuracy in % 71.0784313725
#############
--AVERAGE ACCURACY 70.6617647059

For the term-document matrix, the average accuracy is highest for min_df = 1 and ngram_range = (1, 1).
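The effect of min_df is easy to see on a toy corpus (illustration only, not part of the pipeline): min_df = 1 keeps every term, while larger values drop terms that occur in fewer documents.

# Illustration: how min_df changes the vocabulary size on a toy corpus.
from sklearn.feature_extraction.text import CountVectorizer

toy = ["the cat sat", "the cat ran", "my dog ran"]
for m in (1, 2):
    v = CountVectorizer(ngram_range=(1, 1), min_df=m).fit(toy)
    print m, sorted(v.vocabulary_.keys())
# min_df=1 keeps all six terms; min_df=2 keeps only 'cat', 'ran', 'the'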

Building the Vocabulary

In [102]:
def vocabularymat(TEXTFILES, VOC, min_df, ngram, PLAY, METHOD):
    # Fit a vectorizer on the vocabulary corpus VOC and transform TEXTFILES.
    # METHOD selects raw counts ("TDM") or TF-IDF weights ("TFIDFM").
    if (METHOD == "TDM"):
        voc = CountVectorizer(ngram_range=(1, ngram), max_df=1.0, min_df=min_df)
        voc.fit(VOC)
        if (PLAY == "TRAIN"):
            TrainMat = voc.transform(TEXTFILES)
            return TrainMat
        if (PLAY == "TEST"):
            TestMat = voc.transform(TEXTFILES)
            return TestMat

    if (METHOD == "TFIDFM"):
        voc = TfidfVectorizer(ngram_range=(1, ngram), max_df=1.0, min_df=min_df)
        voc.fit(VOC)
        if (PLAY == "TRAIN"):
            TrainMat = voc.transform(TEXTFILES)
            return TrainMat
        if (PLAY == "TEST"):
            TestMat = voc.transform(TEXTFILES)
            return TestMat
In [103]:
TR_String1 = vocabularymat( Train_String1,Train_String1+Train_String2,min_df = 1,ngram=1, PLAY="TRAIN",METHOD="TDM")
TR_String2 = vocabularymat( Train_String2,Train_String1+Train_String2,min_df = 1,ngram=1, PLAY="TRAIN",METHOD="TDM")
trainlabel = Train_Quality
TE_String1 = vocabularymat(Test_String1,Test_String1 +Test_String2,min_df = 1,ngram=1, PLAY="TEST",METHOD="TDM")
TE_String2 = vocabularymat(Test_String2,Test_String1 +Test_String2,min_df = 1,ngram=1, PLAY="TEST",METHOD="TDM")
In [104]:
Train_Mat = Featurelearning(TR_String1,TR_String2)
In [105]:
Test_Mat = Featurelearning(TE_String1,TE_String2)

Importing Relevant packages

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.svm import SVC

Cross-validation to choose the best hyperparameters
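Note that cross_val_score is imported above but never called; the function below instead averages accuracy over repeated random 90/10 splits. As a sketch of the equivalent built-in route (an alternative under that assumption, not what was actually run), the METHOD="NULL" case could be estimated with k-fold cross-validation directly:

# Sketch: k-fold alternative to the repeated random-split loop ("NULL" branch).
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

scores = cross_val_score(SVC(kernel='linear'), Train_Mat, trainlabel, cv=10)
print scores.mean() * 100   # mean accuracy in %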

In [107]:
def CROSSVALIDATION(TRAINDATA, Trainlabel, Maxnocomp, step, randomstatemax, METHOD):
    # Score a linear SVM on `randomstatemax` random 90/10 splits, optionally after
    # dimensionality reduction ("SVD" or "NMF"); "NULL" uses the raw features.
    if (METHOD == "SVD"):
        Value = 0
        AvgAcc = 0
        for j in range(10, Maxnocomp, step):
            svd = TruncatedSVD(n_components=j, n_iter=7, random_state=42)
            SVD_Matrix = svd.fit_transform(TRAINDATA)
            for i in range(0, randomstatemax):
                X_train, X_test, y_train, y_test = train_test_split(SVD_Matrix, Trainlabel, test_size=0.1, random_state=i)
                clf = SVC(kernel='linear')
                clf.fit(X_train, y_train)
                Value = clf.score(X_test, y_test) * 100
                AvgAcc = AvgAcc + Value
                print '-----No of Components:', j, '-----Random_State:', i
                print 'Accuracy in %', Value
            print 'For', j, 'components', '--AVERAGE ACCURACY', AvgAcc / randomstatemax
            AvgAcc = 0

    if (METHOD == "NMF"):
        Value = 0
        AvgAcc = 0
        for j in range(10, Maxnocomp, step):
            MODEL = NMF(n_components=j, init='random', random_state=0)
            W = MODEL.fit_transform(TRAINDATA)
            for i in range(0, randomstatemax):
                X_train, X_test, y_train, y_test = train_test_split(W, Trainlabel, test_size=0.1, random_state=i)
                clf = SVC(kernel='linear')
                clf.fit(X_train, y_train)
                Value = clf.score(X_test, y_test) * 100
                AvgAcc = AvgAcc + Value
                print '-----No of Components:', j, '-----Random_State:', i
                print 'Accuracy in %', Value
            print 'For', j, 'components', '--AVERAGE ACCURACY', AvgAcc / randomstatemax
            AvgAcc = 0

    if (METHOD == "NULL"):
        Value = 0
        AvgAcc = 0
        for i in range(0, randomstatemax):
            X_train, X_test, y_train, y_test = train_test_split(TRAINDATA, Trainlabel, test_size=0.1, random_state=i)
            clf = SVC(kernel='linear')
            clf.fit(X_train, y_train)
            Value = clf.score(X_test, y_test) * 100
            AvgAcc = AvgAcc + Value
            print '-------Random_State:', i
            print 'Accuracy in %', Value
        print '--AVERAGE ACCURACY', AvgAcc / randomstatemax
        AvgAcc = 0
In [108]:
CROSSVALIDATION(Train_Mat,trainlabel,300,20,10,METHOD="NULL")
-------Random_State: 0
Accuracy in % 71.8137254902
-------Random_State: 1
Accuracy in % 72.0588235294
-------Random_State: 2
Accuracy in % 73.0392156863
-------Random_State: 3
Accuracy in % 73.5294117647
-------Random_State: 4
Accuracy in % 71.3235294118
-------Random_State: 5
Accuracy in % 73.0392156863
-------Random_State: 6
Accuracy in % 71.8137254902
-------Random_State: 7
Accuracy in % 72.3039215686
-------Random_State: 8
Accuracy in % 69.6078431373
-------Random_State: 9
Accuracy in % 72.7941176471
--AVERAGE ACCURACY 72.1323529412

Fit the model using the hyperparameters chosen by cross-validation

In [109]:
clf = SVC(kernel='linear')
clf.fit(Train_Mat, trainlabel)
Out[109]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Training Accuracy

In [110]:
print clf.score(Train_Mat,trainlabel)*100
72.6269315673

To save the model

In [111]:
import pickle
from sklearn.externals import joblib
joblib.dump(clf, 'model_para_tdm.pkl')
Out[111]:
['model_para_tdm.pkl']
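One caveat: sklearn.externals.joblib has been deprecated and removed in newer scikit-learn releases, so with a current install the same save/load goes through the standalone joblib package:

# With newer scikit-learn, import joblib directly (sklearn.externals.joblib is gone).
import joblib
joblib.dump(clf, 'model_para_tdm.pkl')
model = joblib.load('model_para_tdm.pkl')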

Load the model and predict on the test set

In [112]:
model = joblib.load('model_para_tdm.pkl') 
predicted = model.predict(Test_Mat)
predicted = predicted.astype(int)
np.savetxt('predictedlabel_para_tdm.txt',predicted)
In [114]:
PredictedList=predicted.tolist()
print PredictedList.count(1)
print PredictedList.count(0)
1387
339

Now using the TF-IDF matrix

In [115]:
CROSSVALIDATION_REP(Train_String1,Train_String2,Trainlabel,Train_String1+Train_String2,min_dfrange=5,gram=2,randomstatemax=10,METHOD="TFIDFM")
-----ngram_range-1-  1 min_df 1 -----Random_State: 0
Accuracy in % 73.0392156863
-----ngram_range-1-  1 min_df 1 -----Random_State: 1
Accuracy in % 72.7941176471
-----ngram_range-1-  1 min_df 1 -----Random_State: 2
Accuracy in % 75.2450980392
-----ngram_range-1-  1 min_df 1 -----Random_State: 3
Accuracy in % 73.5294117647
-----ngram_range-1-  1 min_df 1 -----Random_State: 4
Accuracy in % 72.0588235294
-----ngram_range-1-  1 min_df 1 -----Random_State: 5
Accuracy in % 71.568627451
-----ngram_range-1-  1 min_df 1 -----Random_State: 6
Accuracy in % 71.8137254902
-----ngram_range-1-  1 min_df 1 -----Random_State: 7
Accuracy in % 70.5882352941
-----ngram_range-1-  1 min_df 1 -----Random_State: 8
Accuracy in % 70.8333333333
-----ngram_range-1-  1 min_df 1 -----Random_State: 9
Accuracy in % 71.568627451
#############
--AVERAGE ACCURACY 72.3039215686
-----ngram_range-1-  1 min_df 2 -----Random_State: 0
Accuracy in % 70.3431372549
-----ngram_range-1-  1 min_df 2 -----Random_State: 1
Accuracy in % 71.3235294118
-----ngram_range-1-  1 min_df 2 -----Random_State: 2
Accuracy in % 73.5294117647
-----ngram_range-1-  1 min_df 2 -----Random_State: 3
Accuracy in % 71.0784313725
-----ngram_range-1-  1 min_df 2 -----Random_State: 4
Accuracy in % 69.8529411765
-----ngram_range-1-  1 min_df 2 -----Random_State: 5
Accuracy in % 71.568627451
-----ngram_range-1-  1 min_df 2 -----Random_State: 6
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 2 -----Random_State: 7
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 2 -----Random_State: 8
Accuracy in % 67.1568627451
-----ngram_range-1-  1 min_df 2 -----Random_State: 9
Accuracy in % 69.362745098
#############
--AVERAGE ACCURACY 70.3431372549
-----ngram_range-1-  1 min_df 3 -----Random_State: 0
Accuracy in % 71.0784313725
-----ngram_range-1-  1 min_df 3 -----Random_State: 1
Accuracy in % 71.568627451
-----ngram_range-1-  1 min_df 3 -----Random_State: 2
Accuracy in % 73.0392156863
-----ngram_range-1-  1 min_df 3 -----Random_State: 3
Accuracy in % 71.568627451
-----ngram_range-1-  1 min_df 3 -----Random_State: 4
Accuracy in % 67.6470588235
-----ngram_range-1-  1 min_df 3 -----Random_State: 5
Accuracy in % 71.568627451
-----ngram_range-1-  1 min_df 3 -----Random_State: 6
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 3 -----Random_State: 7
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 3 -----Random_State: 8
Accuracy in % 68.3823529412
-----ngram_range-1-  1 min_df 3 -----Random_State: 9
Accuracy in % 69.6078431373
#############
--AVERAGE ACCURACY 70.3676470588
-----ngram_range-1-  1 min_df 4 -----Random_State: 0
Accuracy in % 70.5882352941
-----ngram_range-1-  1 min_df 4 -----Random_State: 1
Accuracy in % 71.8137254902
-----ngram_range-1-  1 min_df 4 -----Random_State: 2
Accuracy in % 70.3431372549
-----ngram_range-1-  1 min_df 4 -----Random_State: 3
Accuracy in % 72.3039215686
-----ngram_range-1-  1 min_df 4 -----Random_State: 4
Accuracy in % 70.0980392157
-----ngram_range-1-  1 min_df 4 -----Random_State: 5
Accuracy in % 72.5490196078
-----ngram_range-1-  1 min_df 4 -----Random_State: 6
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 4 -----Random_State: 7
Accuracy in % 69.6078431373
-----ngram_range-1-  1 min_df 4 -----Random_State: 8
Accuracy in % 68.137254902
-----ngram_range-1-  1 min_df 4 -----Random_State: 9
Accuracy in % 70.0980392157
#############
--AVERAGE ACCURACY 70.5147058824

For the TF-IDF (term frequency, inverse document frequency) matrix, the average accuracy is highest for min_df = 1 and ngram_range = (1, 1).
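For intuition, TF-IDF uses the same vocabulary as the count matrix but re-weights it, down-weighting terms that appear in many documents. A toy comparison (illustration only, not part of the pipeline):

# Illustration: raw counts vs TF-IDF weights on a toy corpus.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

toy = ["he is smart", "he is clever", "the dog barked"]
counts = CountVectorizer().fit_transform(toy)
tfidf = TfidfVectorizer().fit_transform(toy)
print counts.toarray()[0]   # integer term counts for the first sentence
print tfidf.toarray()[0]    # same terms, idf-weighted and L2-normalised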

In [116]:
TR_String1 = vocabularymat( Train_String1,Train_String1+Train_String2,min_df = 1,ngram=1, PLAY="TRAIN",METHOD="TFIDFM")
TR_String2 = vocabularymat( Train_String2,Train_String1+Train_String2,min_df = 1,ngram=1, PLAY="TRAIN",METHOD="TFIDFM")
trainlabel = Train_Quality
TE_String1 = vocabularymat(Test_String1,Test_String1 +Test_String2,min_df = 1,ngram=1, PLAY="TEST",METHOD="TFIDFM")
TE_String2 = vocabularymat(Test_String2,Test_String1 +Test_String2,min_df = 1,ngram=1, PLAY="TEST",METHOD="TFIDFM")
In [117]:
Train_Mat = Featurelearning(TR_String1,TR_String2)
In [118]:
Test_Mat = Featurelearning(TE_String1,TE_String2)
In [119]:
CROSSVALIDATION(Train_Mat,trainlabel,300,20,10,METHOD="NULL")
-------Random_State: 0
Accuracy in % 73.0392156863
-------Random_State: 1
Accuracy in % 72.7941176471
-------Random_State: 2
Accuracy in % 75.2450980392
-------Random_State: 3
Accuracy in % 73.5294117647
-------Random_State: 4
Accuracy in % 72.0588235294
-------Random_State: 5
Accuracy in % 71.568627451
-------Random_State: 6
Accuracy in % 71.8137254902
-------Random_State: 7
Accuracy in % 70.5882352941
-------Random_State: 8
Accuracy in % 70.8333333333
-------Random_State: 9
Accuracy in % 71.568627451
--AVERAGE ACCURACY 72.3039215686

Fit the model using the hyperparameters chosen by cross-validation

In [120]:
clf = SVC(kernel='linear')
clf.fit(Train_Mat, trainlabel)
Out[120]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Training Accuracy

In [121]:
print clf.score(Train_Mat,trainlabel)*100
72.4061810155

To save the model

In [90]:
import pickle
from sklearn.externals import joblib
joblib.dump(clf, 'model_para_tfidf.pkl')
Out[90]:
['model_para_tfidf.pkl']

Load the model and predict on the test set

In [122]:
model = joblib.load('model_para_tfidf.pkl') 
predicted = model.predict(Test_Mat)
predicted = predicted.astype(int)
np.savetxt('predictedlabel_para_tfidf.txt',predicted)
In [123]:
PredictedList=predicted.tolist()
print PredictedList.count(1)
print PredictedList.count(0)
1271
455
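As a quick sanity check (a sketch, assuming both prediction files were written as above), one can measure how often the two models agree on the test pairs:

# Sketch: fraction of test pairs on which the TDM and TF-IDF models agree.
import numpy as np
p_tdm = np.loadtxt('predictedlabel_para_tdm.txt')
p_tfidf = np.loadtxt('predictedlabel_para_tfidf.txt')
print (p_tdm == p_tfidf).mean()   # 1.0 would mean identical predictions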
