Skip to content
Snippets Groups Projects
Commit 5f6e174e authored by Tinsaye Abye's avatar Tinsaye Abye
Browse files

Merge branch '4-performance-evaluation' into 'develop'

[feature] measure performance and save to a file

Closes #4

See merge request !4
parents fa6b46ac 40d49aa8
No related branches found
No related tags found
2 merge requests!12Release/1.0.0,!4[feature] measure performance and save to a file
.DS_Store
.idea
target
output
*.iml
.classpath
.project
......
import matplotlib.pyplot as plt
import pandas as pd
file = "tfidf-2020May-291539"
def plotData(data):
    """Plot the quality metrics (precision/recall/fMeasure) against the threshold.

    Args:
        data: DataFrame holding a ``threshold`` column plus the metric columns.
              Mutated in place: ``threshold`` becomes the index so it serves
              as the x-axis of the plot.
    """
    data.set_index("threshold", inplace=True)
    # DataFrame.plot returns the Axes it drew on; use that directly.
    # The previous plt.axes() call creates a NEW, empty axes in
    # matplotlib >= 3.3, so the grid ended up on a blank overlay
    # instead of the plotted chart.
    ax = data.plot()
    ax.yaxis.grid(True)
    plt.show()
    plt.close("all")
# Load the measured quality metrics and visualise the threshold sweep.
metric_columns = ["threshold", "precision", "recall", "fMeasure"]
data = pd.read_csv(file + ".csv")[metric_columns]
plotData(data)
threshold,precision,recall,fMeasure,allPositive,truePositive,perfectCompleteClusterNo
0.0,0.08976660682226212,0.07686395080707148,0.08281573498964803,1114,100,3475
0.05,0.10434782608695652,0.07378939277478862,0.0864475461503827,920,96,3669
0.1,0.10405257393209201,0.0730207532667179,0.08581752484191509,913,95,3676
0.15000000000000002,0.10702341137123746,0.07378939277478862,0.08735213830755231,897,96,3692
0.2,0.1080773606370876,0.0730207532667179,0.08715596330275228,879,95,3710
0.25,0.10941176470588235,0.07148347425057648,0.08647140864714088,850,93,3739
0.3,0.11432009626955475,0.0730207532667179,0.08911819887429642,831,95,3758
0.35,0.11428571428571428,0.07071483474250577,0.08736942070275404,805,92,3784
0.39999999999999997,0.11618798955613577,0.06840891621829362,0.08611514271891631,766,89,3823
0.44999999999999996,0.12309820193637622,0.06840891621829362,0.08794466403162056,723,89,3866
0.49999999999999994,0.12425149700598802,0.06379707916986933,0.08430675469781615,668,83,3921
0.5499999999999999,0.13036020583190394,0.05841660261337433,0.08067940552016985,583,76,4006
0.6,0.1447084233261339,0.05149884704073789,0.07596371882086167,463,67,4126
0.65,0.1585014409221902,0.042275172943889314,0.06674757281553398,347,55,4242
0.7000000000000001,0.16826923076923078,0.026902382782475018,0.04638833664678594,208,35,4381
0.7500000000000001,0.23275862068965517,0.0207532667179093,0.038108680310515175,116,27,4473
0.8000000000000002,0.29850746268656714,0.015372790161414296,0.029239766081871343,67,20,4522
0.8500000000000002,0.4,0.013835511145272867,0.02674591381872214,45,18,4544
0.9000000000000002,0.3235294117647059,0.008455034588777863,0.016479400749063667,34,11,4555
0.9500000000000003,0.19047619047619047,0.0030745580322828594,0.0060514372163388815,21,4,4568
1.0,0.16666666666666666,7.686395080707148E-4,0.0015302218821729152,6,1,4583
\ No newline at end of file
package org.gradoop.famer.linking;
import org.apache.commons.io.FileUtils;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
......@@ -40,8 +41,12 @@ import org.gradoop.famer.preprocessing.io.benchmarks.amazon.AmazonProductsReader
import org.gradoop.flink.model.impl.epgm.GraphCollection;
import org.gradoop.flink.model.impl.epgm.LogicalGraph;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
......@@ -76,11 +81,20 @@ public class Main {
amazonProductsReader.getBenchmarkDataAsGraphCollection(data);
File file = new File("output/tfidf-" + new SimpleDateFormat("yyyyMMM-ddHHmm'.csv'").format(new Date()));
FileUtils.writeStringToFile(file,
"threshold,precision,recall,fMeasure,allPositive,truePositive,perfectCompleteClusterNo");
/* Blocking */
BlockMaker blockMaker = getStandardPrefixBlockingComponent(sourceGraphLabel, targetGraphLabel);
BlockMaker blockMaker = getStandardPrefixBlockingComponent(sourceGraphLabel, targetGraphLabel, "title",
StandardBlockingEmptyKeyStrategy.ADD_TO_ALL);
final DataSet<EPGMVertex> vertices = benchmarkDataCollection.getVertices();
DataSet<Tuple2<EPGMVertex, EPGMVertex>> blockedVertices = blockMaker.execute(vertices);
blockedVertices = blockedVertices.union(
getStandardPrefixBlockingComponent(sourceGraphLabel, targetGraphLabel, "manufacturer",
StandardBlockingEmptyKeyStrategy.REMOVE).execute(vertices));
/* Build wordCountDict */
Map<String, Integer> wordsInDoc =
......@@ -102,45 +116,52 @@ public class Main {
.withBroadcastSet(getExecutionEnvironment().fromElements(wordsInDoc),
TFIDFSimilarityComponent.DOCUMENT_FREQUENCY_BROADCAST);
/* SELECTION */
final LinkerComponent linkerComponent = new LinkerComponent();
linkerComponent.setSelectionComponent(createSelectorComponent(1, 0.70));
DataSet<Tuple3<EPGMVertex, EPGMVertex, Double>> blockedVerticesSimilarityDegree =
blockedVerticesSimilarityFields.flatMap(new Selector(linkerComponent));
for (double threshold = 0; threshold <= 1; threshold = threshold + 0.05) {
/* SELECTION */
final LinkerComponent linkerComponent = new LinkerComponent();
linkerComponent.setSelectionComponent(createSelectorComponent(1, threshold));
DataSet<Tuple3<EPGMVertex, EPGMVertex, Double>> blockedVerticesSimilarityDegree =
blockedVerticesSimilarityFields.flatMap(new Selector(linkerComponent));
/* POST-PROCESSING */
DataSet<EPGMEdge> edges =
blockedVerticesSimilarityDegree.map(new LinkMaker(new EPGMEdgeFactory(), SIM_DEGREE_EDGE_PROPERTY));
/* POST-PROCESSING */
DataSet<EPGMEdge> edges =
blockedVerticesSimilarityDegree.map(new LinkMaker(new EPGMEdgeFactory(), SIM_DEGREE_EDGE_PROPERTY));
final LogicalGraph linkingResult =
benchmarkDataCollection.getConfig().getLogicalGraphFactory().fromDataSets(vertices, edges);
final LogicalGraph linkingResult =
benchmarkDataCollection.getConfig().getLogicalGraphFactory().fromDataSets(vertices, edges);
/* CLUSTERING */
final CLIPConfig clipConfig = new CLIPConfig(0, 2, true, 0.5, 0.2, 0.3);
final LogicalGraph graphCollection =
new CLIP(clipConfig, ClusteringOutputType.GRAPH_COLLECTION, Integer.MAX_VALUE).execute(linkingResult);
/* CLUSTERING */
final CLIPConfig clipConfig = new CLIPConfig(0, 2, true, 0.5, 0.2, 0.3);
final LogicalGraph graphCollection =
new CLIP(clipConfig, ClusteringOutputType.GRAPH_COLLECTION, Integer.MAX_VALUE).execute(linkingResult);
/* Performance Measurement */
final ClusteringQualityMeasures qualityMeasures =
new ClusteringQualityMeasures(graphCollection, data + "/PerfectMapping.csv", ",", "id", true, true,
false, 2);
/* Performance Measurement */
final ClusteringQualityMeasures qualityMeasures =
new ClusteringQualityMeasures(graphCollection, data + "/PerfectMapping.csv", ",", "id", true, true,
false, 2);
final String result = String
.format("%s,%s,%s,%s,%s,%s,%s", threshold, qualityMeasures.computePrecision(),
qualityMeasures.computeRecall(), qualityMeasures.computeFMeasure(),
qualityMeasures.getAllPositives(), qualityMeasures.getTruePositives(),
qualityMeasures.getClusterNo());
System.out.println("Recall " + qualityMeasures.computeRecall());
System.out.println("Precision = " + qualityMeasures.computePrecision());
System.out.println("FMeasure = " + qualityMeasures.computeFMeasure());
System.out.println("AllPositives = " + qualityMeasures.getAllPositives());
System.out.println("MaxClusterSize = " + qualityMeasures.getMaxClusterSize());
System.out.println("AverageClusterSize = " + qualityMeasures.getAverageClusterSize());
System.out.println("TruePositives = " + qualityMeasures.getTruePositives());
System.out.println("PerfectCompleteClusterNo = " + qualityMeasures.getPerfectCompleteClusterNo());
FileUtils.writeStringToFile(file, "\n" + result, StandardCharsets.UTF_8, true);
}
}
private static BlockMaker getStandardPrefixBlockingComponent(String sourceGraphLabel,
String targetGraphLabel) {
KeyGeneratorComponent keyGeneratorComponent = new PrefixLengthComponent("title", 1);
String targetGraphLabel, String attribute, StandardBlockingEmptyKeyStrategy emptyKeyStrategy) {
KeyGeneratorComponent keyGeneratorComponent = new PrefixLengthComponent(attribute, 1);
return getStandardBlockingComponent(sourceGraphLabel, targetGraphLabel, keyGeneratorComponent,
emptyKeyStrategy);
}
private static BlockMaker getStandardBlockingComponent(String sourceGraphLabel, String targetGraphLabel,
KeyGeneratorComponent keyGeneratorComponent, StandardBlockingEmptyKeyStrategy emptyKeyStrategy) {
BlockingKeyGenerator blockingKeyGenerator = new BlockingKeyGenerator(keyGeneratorComponent);
Map<String, Set<String>> graphPairs = new HashMap<>();
// g1 is limited to g2
......@@ -158,7 +179,7 @@ public class Main {
BlockingComponentBaseConfig blockingBaseConfig =
new BlockingComponentBaseConfig(blockingKeyGenerator, graphPairs, categoryPairs);
BlockingComponent blockingComponent =
new StandardBlockingComponent(blockingBaseConfig, 12, StandardBlockingEmptyKeyStrategy.ADD_TO_ALL);
new StandardBlockingComponent(blockingBaseConfig, 12, emptyKeyStrategy);
return new BlockMaker(blockingComponent);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment