Commit 32d01082 authored by Daniel's avatar Daniel
Browse files

conclusion-premise similarity distribution analysis

parent f9c286e6
# %%
# modules
import pandas as pd
import numpy as np
import csv
import itertools
from sentence_transformers import SentenceTransformer, util
import time
import ast
import matplotlib.pyplot as plt
import seaborn as sns
# %%
# Define BERT Model and data path
# Compact pretrained sentence-embedding model from sentence-transformers.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Preprocessed arguments corpus; path is relative to this script's location.
filepath = '../data/args_processed.csv'
# %%
def read_file(filepath: str):
    """Yield the rows of a (potentially large) CSV file one at a time.

    Streams the file lazily instead of loading it fully into memory,
    so callers can slice off just the prefix they need.

    Args:
        filepath: Path to a UTF-8 encoded CSV file.

    Yields:
        list[str]: One parsed CSV row per iteration.
    """
    # newline='' is required by the csv module so quoted fields that
    # contain embedded newlines are parsed correctly.
    with open(filepath, encoding='UTF-8', newline='') as csvfile:
        datareader = csv.reader(csvfile)
        yield from datareader
# %%
size = 1000  # number of rows to read from the file (including the header)
list_conclusions = []
list_premises = []
# Premise count per conclusion: size - 1 data rows follow the header row.
number_prems_per_conc = [0] * (size - 1)
k = read_file(filepath=filepath)
sample = itertools.islice(k, size)
i = 0
fault_reads = 0  # counting errors while reading the rows
for val in sample:
    if i > 0:  # skip the header row
        list_conclusions.append(val[1])
        try:
            # Column 4 holds a stringified Python list of sentence dicts;
            # parse it safely with literal_eval (never eval).
            sentences = ast.literal_eval(val[4])
            for sent in sentences:
                # Keep only premise sentences of a meaningful length.
                if "__PREMISE__" in sent['sent_id'] and len(sent['sent_text']) > 20:
                    list_premises.append(sent['sent_text'])
                    number_prems_per_conc[i - 1] += 1
        # literal_eval raises SyntaxError (not only ValueError) on
        # malformed input, so catch both for unreadable entries.
        except (ValueError, SyntaxError):
            fault_reads += 1
    i += 1
print(f'read {size} rows from {filepath} with {fault_reads} fault reads')
# %%
# retrieve unique conclusion strings
# NOTE(review): this deduplicated list is computed but the embeddings
# below are taken over the full (duplicated) conclusion list — confirm
# whether the dedup was meant to feed model.encode.
list_conclusions_deduplicated = list(set(list_conclusions))
t = time.time()
# perform BERT embeddings; encode() accepts a list of strings directly,
# so the former identity comprehensions were redundant copies.
conclusions_embeddings = model.encode(list_conclusions)
print(time.time() - t)
t = time.time()
premises_embeddings = model.encode(list_premises)
print(time.time() - t)
# %%
# Pairwise cosine similarity between every premise and every conclusion
# embedding, flattened to a 1-D collection of scores. The former [:]
# full-slice copies of the embedding arrays were redundant.
cosine_scores = util.cos_sim(premises_embeddings, conclusions_embeddings).flatten()
np_values = np.array(cosine_scores)
# %%
# matplotlib histogram of all conclusion-premise cosine similarities
plt.hist(np_values, color='blue', edgecolor='black',
         bins=100)
# Add labels (typo fixed: 'Occurence' -> 'Occurrence')
plt.title('Conclusion-Premise Cosine Similarity')
plt.xlabel('Cos Sim')
plt.ylabel('Occurrence')
plt.show()
# %%
# Density Plot
# Kernel-density estimate of the similarity distribution.
sns.displot(np_values, kind="kde")
plt.xlabel("COS Similarity")
plt.show()
# %%
# Plot Cumulative distribution function
# Empirical CDF of the similarity scores.
sns.displot(np_values, kind="ecdf")
plt.xlabel("COS Similarity")
plt.show()
# %%
# Some distribution analysis of the conclusion-premise similarities.
print("Distribution Information for all conclusion-premise COS Similarities")
print(f'Minimum Cos Similarity {min(np_values)}')
print(f'Max Similarity {max(np_values)}')
# 'threshold' avoids shadowing the row counter 'i' used earlier in the
# script; the inline str.format call is replaced by an f-string spec.
for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    # Share of pairs whose similarity exceeds the threshold.
    calc = len(np_values[np_values > threshold]) / len(np_values)
    print(f'{calc * 100:,.2f} % of conclusion-premise pairs have '
          f'a COS Similarity greater than {threshold}')
print(f'Number of compared conclusion-premise pairs: {len(np_values)}')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment