Skip to content
Snippets Groups Projects
Commit efb4db64 authored by Tinsaye Abye's avatar Tinsaye Abye
Browse files

expose draft

parent 622451ef
No related branches found
No related tags found
2 merge requests: !12 "Release/1.0.0", !1 "Resolve 'implement draft from Exposé'"
package org.gradoop.famer.linking.similarityMeasuring.dataStructures;
import org.gradoop.famer.linking.similarityMeasuring.methods.SimilarityComputation;
import org.gradoop.famer.linking.similarityMeasuring.methods.TFIDF;
/**
 * Similarity component whose similarity computation is the {@link TFIDF} method.
 */
public class TFIDFSimilarityComponent extends SimilarityComponent {

  /**
   * Constructs the component from the given base configuration.
   *
   * @param baseConfig the base configuration, handed unchanged to the superclass constructor
   */
  public TFIDFSimilarityComponent(SimilarityComponentBaseConfig baseConfig) {
    super(baseConfig);
  }

  /**
   * Supplies the similarity computation used by this component.
   *
   * @return a newly created {@link TFIDF} instance
   * @throws Exception part of the overridden signature; nothing in this method body throws it
   */
  @Override
  public SimilarityComputation buildSimilarityComputation() throws Exception {
    final TFIDF computation = new TFIDF();
    return computation;
  }
}
package org.gradoop.famer.linking.similarityMeasuring.methods;
import org.gradoop.common.model.impl.properties.PropertyValue;
import javax.annotation.Nonnull;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
/**
 * Computes the cosine similarity of the TF-IDF vectors of two strings.
 *
 * <p>Each string is treated as one document of a two-document corpus. Documents are split at
 * single spaces; empty tokens are dropped and the remaining tokens are lower-cased so that term
 * extraction, term frequency and document frequency all agree on what "the same term" means.
 *
 * <p>The inverse document frequency is smoothed ({@code ln((1 + N) / (1 + df)) + 1}). With only
 * two documents the plain {@code ln(N / df)} is zero for every term both documents share, which
 * makes the dot product of the two vectors vanish for all inputs and yields {@code NaN} for
 * identical inputs.
 */
public class TFIDF implements SimilarityComputation<String> {

  /** Delimiter at which an attribute value is split into words. */
  private static final String STRING_DELIMITER = " ";

  /** Number of documents in the corpus: the two compared attribute values. */
  private static final int CORPUS_SIZE = 2;

  /**
   * Computes the TF-IDF cosine similarity of two attribute values.
   *
   * @param atr1 the first attribute value, must not be null
   * @param atr2 the second attribute value, must not be null
   * @return a value in {@code [0, 1]}; {@code 0} if the values share no term or if at least one
   *         of them contains no term at all
   * @throws NullPointerException if one of the arguments is null
   */
  @Override
  public double computeSimilarity(String atr1, String atr2) throws Exception {
    Objects.requireNonNull(atr1, "First attribute should not be null");
    Objects.requireNonNull(atr2, "Second attribute should not be null");

    final List<String> doc1Words = tokenize(atr1);
    final List<String> doc2Words = tokenize(atr2);
    final List<String> terms = extractTerms(doc1Words, doc2Words);

    final double[] doc1Vector =
      terms.stream().mapToDouble(term -> tfidf(term, doc1Words, doc2Words)).toArray();
    final double[] doc2Vector =
      terms.stream().mapToDouble(term -> tfidf(term, doc2Words, doc1Words)).toArray();

    return computeCosineSimilarity(doc1Vector, doc2Vector);
  }

  @Override
  public String parsePropertyValue(PropertyValue value) {
    return getString(value);
  }

  /**
   * Splits a text into normalised words.
   *
   * @param text the text to split
   * @return the lower-cased, non-empty tokens of the text in their original order
   */
  private static List<String> tokenize(final String text) {
    final List<String> tokens = new ArrayList<>();
    for (String token : Arrays.asList(text.split(STRING_DELIMITER))) {
      // Leading or repeated delimiters (and the empty string itself) produce empty tokens
      // that must not be counted as terms.
      if (!token.isEmpty()) {
        tokens.add(token.toLowerCase(java.util.Locale.ROOT));
      }
    }
    return tokens;
  }

  /**
   * Computes the TF-IDF weight of a term with respect to the first document.
   *
   * @param term the term
   * @param doc1 the document the weight is computed for
   * @param doc2 the other document of the corpus
   * @return tfidf score
   */
  private double tfidf(final String term, final List<String> doc1, final List<String> doc2) {
    return tf(term, doc1) * idf(term, doc1, doc2);
  }

  /**
   * Counts how often a term occurs in a document, ignoring case.
   *
   * @param term the term
   * @param doc the document
   * @return the raw term frequency
   */
  private double tf(final String term, final List<String> doc) {
    return doc.stream().filter(word -> word.equalsIgnoreCase(term)).count();
  }

  /**
   * Calculates the smoothed inverse document frequency
   * {@code ln((1 + |D|) / (1 + |{d : term ∈ d}|)) + 1}.
   *
   * @param term the term
   * @param doc1 the first document
   * @param doc2 the second document
   * @return the inverse document frequency, or {@code 0} if no document contains the term
   */
  private double idf(final String term, final List<String> doc1, final List<String> doc2) {
    int df = 0;
    if (containsIgnoreCase(doc1, term)) {
      df++;
    }
    if (containsIgnoreCase(doc2, term)) {
      df++;
    }
    if (df == 0) {
      return 0D;
    }
    return Math.log((1D + CORPUS_SIZE) / (1D + df)) + 1D;
  }

  /**
   * Checks whether a document contains a term, using the same case-insensitive comparison as
   * the term frequency.
   *
   * @param doc the document
   * @param term the term
   * @return true if at least one word of the document equals the term ignoring case
   */
  private static boolean containsIgnoreCase(final List<String> doc, final String term) {
    return doc.stream().anyMatch(word -> word.equalsIgnoreCase(term));
  }

  /**
   * Puts only unique terms from both docs to a single {@link ArrayList}
   *
   * @param doc1 the first document
   * @param doc2 the second document
   * @return duplicate free List
   */
  @Nonnull
  public static List<String> extractTerms(final List<String> doc1, final List<String> doc2) {
    final Set<String> result = new HashSet<>(doc1);
    result.addAll(doc2);
    return new ArrayList<>(result);
  }

  /**
   * Computes the cosine similarity of two vectors of equal length.
   *
   * @param vector1 the first vector
   * @param vector2 the second vector
   * @return the cosine similarity of the two vectors, or {@code 0} if one of them has norm zero
   */
  private double computeCosineSimilarity(final double[] vector1, final double[] vector2) {
    final double norm1 = computeVectorNorm(vector1);
    final double norm2 = computeVectorNorm(vector2);
    if (norm1 == 0D || norm2 == 0D) {
      // A document without terms has no direction; avoid 0 / 0 = NaN.
      return 0D;
    }
    double dotProduct = 0D;
    for (int i = 0; i < vector1.length; i++) {
      dotProduct += vector1[i] * vector2[i];
    }
    // Rounding can push the quotient marginally above 1 for identical vectors.
    return Math.min(1D, dotProduct / (norm1 * norm2));
  }

  /**
   * Computes the Euclidean norm of a vector.
   *
   * @param vector a vector
   * @return the norm of this vector
   */
  private double computeVectorNorm(final double[] vector) {
    double sumOfSquares = 0D;
    for (double component : vector) {
      sumOfSquares += component * component;
    }
    return Math.sqrt(sumOfSquares);
  }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment