Commit cf9de91c authored by Jerome Wuerf's avatar Jerome Wuerf
Browse files

Add paper

parent 690662ca
......@@ -286,3 +286,5 @@ TSWLatexianTemp*
# option is specified. Footnotes are then stored in a file with suffix Notes.bib.
# Uncomment the next line to have this generated file ignored.
\ No newline at end of file
image: tianon/latex
- cd paper && latexmk -synctex=1 -interaction=nonstopmode -file-line-error -pdf -outdir=../out paper
- "./out/*.pdf"
# Paper
Latest version of the paper can be found [here](
\ No newline at end of file
% Journals
% First the Full Name is given, then the abbreviation used in the AMS Math
% Reviews, with an indication if it could not be found there.
% Note the 2nd overwrites the 1st, so swap them if you want the full name.
@String{AMSTrans = "American Mathematical Society Translations" }
@String{AMSTrans = "Amer. Math. Soc. Transl." }
@String{BullAMS = "Bulletin of the American Mathematical Society" }
@String{BullAMS = "Bull. Amer. Math. Soc." }
@String{ProcAMS = "Proceedings of the American Mathematical Society" }
@String{ProcAMS = "Proc. Amer. Math. Soc." }
@String{TransAMS = "Transactions of the American Mathematical Society" }
@String{TransAMS = "Trans. Amer. Math. Soc." }
@String{CACM = "Communications of the {ACM}" }
@String{CACM = "Commun. {ACM}" }
@String{CompServ = "Comput. Surveys" }
@String{JACM = "J. ACM" }
@String{ACMMathSoft = "{ACM} Transactions on Mathematical Software" }
@String{ACMMathSoft = "{ACM} Trans. Math. Software" }
@String{SIGNUM = "{ACM} {SIGNUM} Newsletter" }
@String{SIGNUM = "{ACM} {SIGNUM} Newslett." }
@String{AmerSocio = "American Journal of Sociology" }
@String{AmerStatAssoc = "Journal of the American Statistical Association" }
@String{AmerStatAssoc = "J. Amer. Statist. Assoc." }
@String{ApplMathComp = "Applied Mathematics and Computation" }
@String{ApplMathComp = "Appl. Math. Comput." }
@String{AmerMathMonthly = "American Mathematical Monthly" }
@String{AmerMathMonthly = "Amer. Math. Monthly" }
@String{BIT = "{BIT}" }
@String{BritStatPsych = "British Journal of Mathematical and Statistical
Psychology" }
@String{BritStatPsych = "Brit. J. Math. Statist. Psych." }
@String{CanMathBull = "Canadian Mathematical Bulletin" }
@String{CanMathBull = "Canad. Math. Bull." }
@String{CompApplMath = "Journal of Computational and Applied Mathematics" }
@String{CompApplMath = "J. Comput. Appl. Math." }
@String{CompPhys = "Journal of Computational Physics" }
@String{CompPhys = "J. Comput. Phys." }
@String{CompStruct = "Computers and Structures" }
@String{CompStruct = "Comput. \& Structures" }
@String{CompJour = "The Computer Journal" }
@String{CompJour = "Comput. J." }
@String{CompSysSci = "Journal of Computer and System Sciences" }
@String{CompSysSci = "J. Comput. System Sci." }
@String{Computing = "Computing" }
@String{ContempMath = "Contemporary Mathematics" }
@String{ContempMath = "Contemp. Math." }
@String{Crelle = "Crelle's Journal" }
@String{GiornaleMath = "Giornale di Mathematiche" }
@String{GiornaleMath = "Giorn. Mat." } % didn't find in AMS MR., ibid.
@String{Computer = "{IEEE} Computer" }
@String{IEEETransComp = "{IEEE} Transactions on Computers" }
@String{IEEETransComp = "{IEEE} Trans. Comput." }
@String{IEEETransAC = "{IEEE} Transactions on Automatic Control" }
@String{IEEETransAC = "{IEEE} Trans. Automat. Control" }
@String{IEEESpec = "{IEEE} Spectrum" } % didn't find in AMS MR
@String{ProcIEEE = "Proceedings of the {IEEE}" }
@String{ProcIEEE = "Proc. {IEEE}" } % didn't find in AMS MR
@String{IEEETransAeroElec = "{IEEE} Transactions on Aerospace and Electronic
Systems" }
@String{IEEETransAeroElec = "{IEEE} Trans. Aerospace Electron. Systems" }
@String{IMANumerAna = "{IMA} Journal of Numerical Analysis" }
@String{IMANumerAna = "{IMA} J. Numer. Anal." }
@String{InfProcLet = "Information Processing Letters" }
@String{InfProcLet = "Inform. Process. Lett." }
@String{InstMathApp = "Journal of the Institute of Mathematics and
its Applications" }
@String{InstMathApp = "J. Inst. Math. Appl." }
@String{IntControl = "International Journal of Control" }
@String{IntControl = "Internat. J. Control" }
@String{IntNumerEng = "International Journal for Numerical Methods in
Engineering" }
@String{IntNumerEng = "Internat. J. Numer. Methods Engrg." }
@String{IntSuper = "International Journal of Supercomputing Applications" }
@String{IntSuper = "Internat. J. Supercomputing Applic." } % didn't find
%% in AMS MR
@String{Kibernetika = "Kibernetika" }
@String{JResNatBurStand = "Journal of Research of the National Bureau
of Standards" }
@String{JResNatBurStand = "J. Res. Nat. Bur. Standards" }
@String{LinAlgApp = "Linear Algebra and its Applications" }
@String{LinAlgApp = "Linear Algebra Appl." }
@String{MathAnaAppl = "Journal of Mathematical Analysis and Applications" }
@String{MathAnaAppl = "J. Math. Anal. Appl." }
@String{MathAnnalen = "Mathematische Annalen" }
@String{MathAnnalen = "Math. Ann." }
@String{MathPhys = "Journal of Mathematical Physics" }
@String{MathPhys = "J. Math. Phys." }
@String{MathComp = "Mathematics of Computation" }
@String{MathComp = "Math. Comp." }
@String{MathScand = "Mathematica Scandinavica" }
@String{MathScand = "Math. Scand." }
@String{TablesAidsComp = "Mathematical Tables and Other Aids to Computation" }
@String{TablesAidsComp = "Math. Tables Aids Comput." }
@String{NumerMath = "Numerische Mathematik" }
@String{NumerMath = "Numer. Math." }
@String{PacificMath = "Pacific Journal of Mathematics" }
@String{PacificMath = "Pacific J. Math." }
@String{ParDistComp = "Journal of Parallel and Distributed Computing" }
@String{ParDistComp = "J. Parallel and Distrib. Comput." } % didn't find
%% in AMS MR
@String{ParComputing = "Parallel Computing" }
@String{ParComputing = "Parallel Comput." }
@String{PhilMag = "Philosophical Magazine" }
@String{PhilMag = "Philos. Mag." }
@String{ProcNAS = "Proceedings of the National Academy of Sciences
of the USA" }
@String{ProcNAS = "Proc. Nat. Acad. Sci. U. S. A." }
@String{Psychometrika = "Psychometrika" }
@String{QuartMath = "Quarterly Journal of Mathematics, Oxford, Series (2)" }
@String{QuartMath = "Quart. J. Math. Oxford Ser. (2)" }
@String{QuartApplMath = "Quarterly of Applied Mathematics" }
@String{QuartApplMath = "Quart. Appl. Math." }
@String{RevueInstStat = "Review of the International Statistical Institute" }
@String{RevueInstStat = "Rev. Inst. Internat. Statist." }
@String{JSIAM = "Journal of the Society for Industrial and Applied
Mathematics" }
@String{JSIAM = "J. Soc. Indust. Appl. Math." }
@String{JSIAMB = "Journal of the Society for Industrial and Applied
Mathematics, Series B, Numerical Analysis" }
@String{JSIAMB = "J. Soc. Indust. Appl. Math. Ser. B Numer. Anal." }
@String{SIAMAlgMeth = "{SIAM} Journal on Algebraic and Discrete Methods" }
@String{SIAMAlgMeth = "{SIAM} J. Algebraic Discrete Methods" }
@String{SIAMAppMath = "{SIAM} Journal on Applied Mathematics" }
@String{SIAMAppMath = "{SIAM} J. Appl. Math." }
@String{SIAMComp = "{SIAM} Journal on Computing" }
@String{SIAMComp = "{SIAM} J. Comput." }
@String{SIAMMatrix = "{SIAM} Journal on Matrix Analysis and Applications" }
@String{SIAMMatrix = "{SIAM} J. Matrix Anal. Appl." }
@String{SIAMNumAnal = "{SIAM} Journal on Numerical Analysis" }
@String{SIAMNumAnal = "{SIAM} J. Numer. Anal." }
@String{SIAMReview = "{SIAM} Review" }
@String{SIAMReview = "{SIAM} Rev." }
@String{SIAMSciStat = "{SIAM} Journal on Scientific and Statistical
Computing" }
@String{SIAMSciStat = "{SIAM} J. Sci. Statist. Comput." }
@String{SoftPracExp = "Software Practice and Experience" }
@String{SoftPracExp = "Software Prac. Experience" } % didn't find in AMS MR
@String{StatScience = "Statistical Science" }
@String{StatScience = "Statist. Sci." }
@String{Techno = "Technometrics" }
@String{USSRCompMathPhys = "{USSR} Computational Mathematics and Mathematical
Physics" }
@String{USSRCompMathPhys = "{U. S. S. R.} Comput. Math. and Math. Phys." }
@String{VLSICompSys = "Journal of {VLSI} and Computer Systems" }
@String{VLSICompSys = "J. {VLSI} Comput. Syst." }
@String{ZAngewMathMech = "Zeitschrift f{\"u}r Angewandte Mathematik und
Mechanik" }
@String{ZAngewMathMech = "Z. Angew. Math. Mech." }
@String{ZAngewMathPhys = "Zeitschrift f{\"u}r Angewandte Mathematik und Physik" }
@String{ZAngewMathPhys = "Z. Angew. Math. Phys." }
% Publishers % ================================================= |
@String{Academic = "Academic Press" }
@String{ACMPress = "{ACM} Press" }
@String{AdamHilger = "Adam Hilger" }
@String{AddisonWesley = "Addison-Wesley" }
@String{AllynBacon = "Allyn and Bacon" }
@String{AMS = "American Mathematical Society" }
@String{Birkhauser = "Birkh{\"a}user" }
@String{CambridgePress = "Cambridge University Press" }
@String{Chelsea = "Chelsea" }
@String{ClaredonPress = "Clarendon Press" }
@String{DoverPub = "Dover Publications" }
@String{Eyolles = "Eyrolles" }
@String{HoltRinehartWinston = "Holt, Rinehart and Winston" }
@String{Interscience = "Interscience" }
@String{JohnsHopkinsPress = "The Johns Hopkins University Press" }
@String{JohnWileySons = "John Wiley and Sons" }
@String{Macmillan = "Macmillan" }
@String{MathWorks = "The Math Works Inc." }
@String{McGrawHill = "McGraw-Hill" }
@String{NatBurStd = "National Bureau of Standards" }
@String{NorthHolland = "North-Holland" }
@String{OxfordPress = "Oxford University Press" } %address Oxford or London?
@String{PergamonPress = "Pergamon Press" }
@String{PlenumPress = "Plenum Press" }
@String{PrenticeHall = "Prentice-Hall" }
@String{SIAMPub = "{SIAM} Publications" }
@String{Springer = "Springer-Verlag" }
@String{TexasPress = "University of Texas Press" }
@String{VanNostrand = "Van Nostrand" }
@String{WHFreeman = "W. H. Freeman and Co." }
author = {Carbonell, Jaime and Stewart, Jade},
year = {1999},
month = {06},
pages = {},
title = {The Use of MMR, Diversity-Based Reranking for Reordering Documents and Producing Summaries},
journal = {SIGIR Forum (ACM Special Interest Group on Information Retrieval)},
doi = {10.1145/290941.291025}
title = "Towards an argumentative content search engine using weak supervision",
author = "Levy, Ran and
Bogin, Ben and
Gretz, Shai and
Aharonov, Ranit and
Slonim, Noam",
booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
month = aug,
year = "2018",
address = "Santa Fe, New Mexico, USA",
publisher = "Association for Computational Linguistics",
url = "",
pages = "2066--2081",
title={MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers},
author={Wenhui Wang and Furu Wei and Li Dong and Hangbo Bao and Nan Yang and Ming Zhou},
title = "{A}rgumen{T}ext: Searching for Arguments in Heterogeneous Sources",
author = "Stab, Christian and
Daxenberger, Johannes and
Stahlhut, Chris and
Miller, Tristan and
Schiller, Benjamin and
Tauchmann, Christopher and
Eger, Steffen and
Gurevych, Iryna",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Demonstrations",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "",
doi = "10.18653/v1/N18-5005",
pages = "21--25",
author = {Wachsmuth, Henning and Stein, Benno and Ajjour, Yamen},
year = {2017},
month = {01},
pages = {1117-1127},
title = {``PageRank'' for Argument Relevance},
doi = {10.18653/v1/E17-1105}
title={Conspiracy theories and their societal effects during the COVID-19 pandemic},
author={Pummerer, Lotte and B{\"o}hm, Robert and Lilleholt, Lau and Winter, Kevin and Zettler, Ingo and Sassenberg, Kai},
journal={Social Psychological and Personality Science},
publisher={SAGE Publications Sage CA: Los Angeles, CA}
address = {Berlin Heidelberg New York},
author = {Alexander Bondarenko and Lukas Gienapp and Maik Fr{\"o}be and Meriem Beloucif and Yamen Ajjour and Alexander Panchenko and Chris Biemann and Benno Stein and Henning Wachsmuth and Martin Potthast and Matthias Hagen},
booktitle = {Experimental IR Meets Multilinguality, Multimodality, and Interaction. 12th International Conference of the CLEF Association (CLEF 2021)},
doi = {10.1007/978-3-030-85251-1\_28},
editor = {{K. Sel{\c{c}}uk} Candan and Bogdan Ionescu and Lorraine Goeuriot and Henning M{\"u}ller and Alexis Joly and Maria Maistro and Florina Piroi and Guglielmo Faggioli and Nicola Ferro},
ids = {},
month = sep,
pages = {450-467},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
site = {Bucharest, Romania},
title = {{Overview of Touch{\'e} 2021: Argument Retrieval}},
url = {},
volume = 12880,
year = 2021,
title = "Building an Argument Search Engine for the Web",
author = "Wachsmuth, Henning and
Potthast, Martin and
Al-Khatib, Khalid and
Ajjour, Yamen and
Puschmann, Jana and
Qu, Jiani and
Dorsch, Jonas and
Morari, Viorel and
Bevendorff, Janek and
Stein, Benno",
booktitle = "Proceedings of the 4th Workshop on Argument Mining",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "",
doi = "10.18653/v1/W17-5106",
pages = "49--59",
author = {Ajjour, Yamen and Wachsmuth, Henning and Kiesel, Johannes and Potthast, Martin and Hagen, Matthias and Stein, Benno},
year = {2019},
month = {08},
pages = {48-59},
title = {Data Acquisition for Argument Search: The args.me Corpus},
isbn = {978-3-030-30178-1},
doi = {10.1007/978-3-030-30179-8_4}
author = {Kusner, Matt J. and Sun, Yu and Kolkin, Nicholas I. and Weinberger, Kilian Q.},
title = {From Word Embeddings to Document Distances},
year = {2015},
publisher = {},
abstract = {We present the Word Mover's Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local cooccurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to "travel" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover's Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.},
booktitle = {Proceedings of the 32nd International Conference on Machine Learning - Volume 37},
pages = {957--966},
numpages = {10},
location = {Lille, France},
series = {ICML'15}
address = {Berlin Heidelberg New York},
author = {Alexander Bondarenko and Matthias Hagen and Martin Potthast and Henning Wachsmuth and Meriem Beloucif and Chris Biemann and Alexander Panchenko and Benno Stein},
booktitle = {Advances in Information Retrieval. 42nd European Conference on IR Research (ECIR 2020)},
doi = {10.1007/978-3-030-45442-5\_67},
editor = {Pablo Castells and Nicola Ferro and {Joemon M.} Jose and Jo{\~a}o Magalh{\~a}es and {M{\'a}rio J.} Silva and Emine Yilmaz},
ids = {potthast:2020d,stein:2020c},
month = apr,
pages = {517-523},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
site = {Lisbon, Portugal},
title = {{Touch\'{e}: First Shared Task on Argument Retrieval}},
url = {},
volume = 12036,
year = 2020
title={A study of smoothing methods for language models applied to ad hoc information retrieval},
author={Zhai, Chengxiang and Lafferty, John},
booktitle={ACM SIGIR Forum},
organization={ACM New York, NY, USA}
author = {Aric A. Hagberg and Daniel A. Schult and Pieter J. Swart},
title = {Exploring Network Structure, Dynamics, and Function using NetworkX},
booktitle = {Proceedings of the 7th Python in Science Conference},
pages = {11--15},
address = {Pasadena, CA USA},
year = {2008},
editor = {Ga\"el Varoquaux and Travis Vaught and Jarrod Millman},
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "",
@book{Walton, place={Cambridge}, title={Argumentation Schemes}, DOI={10.1017/CBO9780511802034}, publisher={Cambridge University Press}, author={Walton, Douglas and Reed, Christopher and Macagno, Fabrizio}, year={2008}}
title={Efficient estimation of word representations in vector space},
author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
journal={arXiv preprint arXiv:1301.3781},
address = {Berlin Heidelberg New York},
author = {Alexander Bondarenko and Maik Fr{\"o}be and Johannes Kiesel and Shahbaz Syed and Timon Gurcke and Meriem Beloucif and Alexander Panchenko and Chris Biemann and Benno Stein and Henning Wachsmuth and Martin Potthast and Matthias Hagen},
booktitle = {Advances in Information Retrieval. 44th European Conference on IR Research (ECIR 2022)},
editor = {Matthias Hagen and Suzan Verberne and Craig Macdonald and Christin Seifert and Krisztian Balog and Kjetil N{\o}rv\r{a}g and Vinay Setty},
month = apr,
publisher = {Springer},
series = {Lecture Notes in Computer Science},
site = {Stavanger, Norway},
title = {{Overview of Touch{\'e} 2022: Argument Retrieval}},
todo = {pages, doi, url, videourl, slides, volume},
year = 2022
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
Everyday search engines are fact-driven. They try to serve the user's information needs as quickly as possible. This strategy could hinder a deep engagement with controversial topics; thus, there is a need for approaches that present a more diverse set of information to support opinion formation. This work aims to examine different reranking approaches using the preprocessed args.me corpus in order to solve the Touché 2022 Argument Retrieval Task. The proposed retrieval system relies on an initial retrieval using a semantic search on a sentence level and leverages heuristics based on first principles. Our reranking approaches incorporate \textit{maximal marginal relevance}, \textit{word mover's distance}, and a novel approach, based on fuzzy matching of part-of-speech tags, that we call \textit{structural distance}. Further, we explore the applicability of a graph-based reranking approach. The results indicate that the reranking approaches improve argument quality to varying degrees at the cost of relevance. \textit{Structural distance} performs best with minimal loss in relevance and the most significant gain in terms of quality.
%Furthermore, limitations of the applicability of a graph-based approach for our retrieval system were explored.
%The experiments are evaluated regarding the relevance and quality of the retrieved results and show the benefits of the approaches maximum marginal relevance, word mover distance and structural distance and furthermore describe the problems that arise with the graph based approach.
\ No newline at end of file
The enduring protests against the pandemic restrictions seem to uncover an imminent problem in the current discussion culture. Despite increased exposure to data and information through our daily lives, we fail to present the gained knowledge to enable debates and support individuals' opinion formation. Regarding COVID-19, it has been shown that people exposed to misinformation, biased media, and conspiracy theories have lower trust in democratic institutions~\cite{pummerer2022conspiracy}. This situation makes it urgent for societies to confront misinformed individuals with reasonable arguments. Besides COVID-19, web resources, like blogs and news sites, address many other topics with a similar impact in an accelerating fashion, creating the need for automatic retrieval of reasonable arguments.
This work describes our submission for Task 1 of Touché 2022~\cite{bondarenko:2022c}. The task asks participants to create an argument retrieval system for a given corpus to support the opinion formation on controversial societal topics. In this year's version of the first task, the requirements for the final systems differ from the previous years. The submitted retrieval systems should output pairs of reasonable argumentative sentences for a given topic. An argument is reasonable if the retrieved sentences are relevant and of high quality. The quality of arguments is defined by (1) the argumentativeness of each sentence, (2) the coherence between the sentences, and (3) the extent to which the sentences of the pair together form a summary of their originating arguments~\cite{bondarenko:2022c}.
Our proposed system consists of three main components: indexing, initial retrieval, and reranking. The system's source code is publicly available\footnote{}. Before indexing, sentences of the provided preprocessed args.me corpus~\cite{argsme2} are transformed into vector embeddings. Sentences and vector embeddings are persisted into two indices, one for premises and one for conclusions. We effectively conduct a nearest neighbor search in the embedding space at retrieval time. The search utilizes the cosine similarity between query embedding and the embeddings in the respective index. This approach maximizes the semantic similarity between sentences, which should yield relevant sentences. In the following, we will refer to this as \textit{semantic search}. Finally, we compare multiple reranking approaches that aim to balance relevance and diversification of query results by assessing differences between a query and the retrieved sentences. Having outlined our initial motivation and a rough system overview of how we approach the given task, we pose the following research question:
\textit{Do simple, argument-agnostic reranking approaches improve argument quality compared to an initial semantic search?}
To answer our research question, we conducted experiments with three different reranking approaches utilizing \textit{maximal marginal relevance} (MMR), \textit{structural distance} (SD), and \textit{word mover's distance} (WMD). All three reranking approaches increase argument quality while sacrificing argument relevance. Further, we explored a graph-based argument reranking approach, which we did not fully develop due to challenges with our own setup and the structure of the processed dataset. Nevertheless, we would like to share our insights with the research community. Section 2 introduces related work on argument retrieval and existing reranking methods. Following the related work, Section 3 describes our system and reranking approaches in detail. Section 4 presents the evaluation of our experiments, which are discussed in Section 5.
% /\_____/\
% / o o \
% ( == ^ == )
% ) (
% ( Jerome )
% ( ( ) ( ) )
% (__(__)___(__)__)
\section{Related Work} %TO DO 2 Maya
This section introduces the challenge of argument retrieval and describes existing reranking approaches. We pick up on the shortcomings of previous studies to justify the design of our system.
\subsection{Challenges in argument retrieval}
Retrieving arguments for controversial topics requires a search system that anticipates the underlying information needs of users. Users want to find supportive and opposing arguments on the respective topics quickly and comprehensibly. Argument search denotes a relatively new field of research dedicated to retrieving arguments on controversial topics. It unites challenges of natural language processing and information retrieval while opening up a broad range of research opportunities for computational argumentation~\cite{building_arg_2017}. ArgumenText~\cite{argtext}, the IBM Debater~\cite{ibm}, and args.me~\cite{building_arg_2017, argsme2} are relevant pioneers in that field of research offering diverse technical approaches to the challenges of argument retrieval.
The args.me corpus~\cite{argsme2} used in the Touché 2022 Task 1 consists of arguments crawled from the debate portals debatewise.org, idebate.org, debatepedia.org, and debate.org~\cite{argsme2}. Arguments in the corpus are composed of a conclusion paired with a set of premises following the argumentation scheme proposed by Walton~\cite{Walton} and others. Each premise has a supportive or opposing stance towards a conclusion.
\subsection{Existing methods in argument retrieval}
Previous years of Touché showed substantial improvements in retrieval scores. In the first year, multiple submissions indicated that the DirichletLM \cite{zhai2017study} retrieval model is a strong baseline for the initial retrieval of whole arguments \cite{bondarenko:2020b}. Additionally, query expansion mechanisms were deployed to increase recall. Submissions for the second round of Touché indicated that argument-aware reranking approaches using fine-tuned language models improved previous years' results. Further, approaches focused on parameter tuning using existing relevance judgments \cite{bondarenko:2021d}.
Up to now, only a minority of submissions leveraged semantic embeddings for an initial retrieval, which motivates us to use a semantic search. Further, we try to mimic query expansion mechanisms by querying premises with multiple conclusions on a controversial topic. Finally, our reranking distinguishes us from existing approaches, as we do not rely on argument-specific domain features or any learned methods.
\ No newline at end of file
\section{Methodological Approach}
The architecture of our proposed retrieval system (Figure~\ref{fig:systemarchitecture}) consists of indexing, retrieval, and a reranking module. The system relies on two Elasticsearch\footnote{} indices, one for conclusions and one for premises. Initially, our system uses the preprocessed args.me corpus~\cite{argsme2}, which holds arguments divided into their constituent premise and conclusion sentences. The sentences are transformed into vector embeddings (Section~\ref{sec:preprocessing}). Premises and conclusions are indexed into the respective indices with their vector embeddings. While indexing, the standard tokenization pipeline of Elasticsearch is applied to save the number of tokens in a sentence as further metadata.
Using a semantic search, the retrieval module generates an initial ranking for premise and conclusion pairs. First, it queries the conclusion index for a given controversial topic. Next, each conclusion serves as a query for the premise index. The initial ranking scores are based on the cosine similarity between the vector embeddings of a query and the indexed sentences, thus mimicking the nearest neighbor search in the embedding space. Additionally, we introduce the hard constraint that a retrieved sentence must have at least 1.75 times the number of tokens of the query. In the following, we will refer to this constraint as \textit{token factor}. By convention, we retrieve 100 conclusions and 50 premises per conclusion. A primary motivation behind this two-step retrieval is an expected increase in premise recall, as we query the premise index multiple times using differing conclusions.
Finally, the reranking module scores conclusions and the premises separately using three different methods (Section \ref{sec:reranking}). In general, these methods improve the quality of the retrieved sentences by calculating new ranking scores between the query and initially ranked sentences. Lastly, our system generates a text file in the TREC format. When writing the output file, we enforce on a topic level that there are no duplicates in the retrieved premises, and premises must match the stance of a conclusion.
\caption{Argument retrieval system architecture}
The organizers of the shared task provide a preprocessed version of the args.me dataset~\cite{argsme2} that contains the constituent sentences of each argument of the original dataset. Further, it contains context metadata for each argument and the stance for each sentence. Initially, we transform the provided preprocessed dataset into a structured parquet file. One row of this flat file corresponds to one sentence. One row holds information about the argument id, sentence number, stance towards a topic, sentence text, and the sentence type, either conclusion or premise. The flat file contains 6,123,792 sentences that split into 338,595 conclusions and 5,785,197 premises.
Next, we deduplicate the sentences using an exact string match. For the conclusions, we count 328,474 duplicates with 54,512 unique ones, and for the premises, we count 770,876 duplicates with 273,593 unique duplicates. Interestingly, the majority distribution of stances differs within the unique premises and conclusion duplicates. Of the 54,512 unique conclusion duplicates, 36,596 have a contra stance, whereas 17,916 have a supportive stance. The high number of duplicates of conclusions arises from the parsing of debate platforms. Conclusions are often simply the headline of a post on a controversial topic. A post contains multiple arguments. The duplicated premises arise from the citations between different posts.
As our system imposes a \textit{token factor} as a constraint for retrieved sentences, we analyzed the distribution of the number of tokens for all conclusions and observed an average number of tokens of 32.49 (Figure~\ref{fig:tokensconclusions}). Concerning the premises, we observed a much higher average total number of tokens of 1877.88 (Figure~\ref{fig:tokenspremises}). As a final preprocessing step, each sentence is encoded into a vector embedding via an out-of-the-box MiniLM~\cite{wang2020minilm} language model\footnote{} utilizing the sentence transformers library~\cite{reimers-2019-sentence-bert}.
\captionof{figure}{Distribution of tokens per \\ conclusions.}
\captionof{figure}{Distribution of tokens per \\ all premises.}
%Many quotations appear in other arguments that have nothing to do with them -> many duplicates, and it is hard to assign premises to conclusions for the graph because it is inflated?
\subsection{Reranking approaches}
To improve the argument quality of our initial retrieval, we examine three different argument-agnostic reranking approaches using existing implementations. Due to the two-step retrieval approach of our system, reranking scores of conclusions and premises are calculated separately. First, we rerank the conclusions, then each set of premises. Each approach combines the respective reranking score with the initial ranking score using a weighted sum (Section~\ref{sec:finalScoring}). We hope that this general approach improves the argumentative quality by simply ensuring that the top results differ from the original query, which should have, due to the semantic search, returned relevant documents. Furthermore, we explore the limitations of our system regarding the use of a graph-based argument relevance for reranking.
\subsubsection{Maximal marginal relevance}
Besides a document's relevance to the query, whether a document provides new information compared to its predecessors is an essential factor. Simply maximizing the relevance to the query is an appropriate choice if there are only a few relevant documents. However, duplicate information is inevitable with an abundance of relevant documents, and reducing this redundancy becomes increasingly essential. \citeauthor{mmr}~\cite{mmr} proposed MMR, which linearly combines query relevance with information novelty, thus ensuring high document relevance with minimal similarity to previously chosen ones. The tradeoff between query relevance and information novelty is controlled by a parameter \(\lambda\). For our experiments, we assess different values of \(\lambda\).
% \begin{equation}
% \text{MMR} := \arg \max_{D_i \in R \setminus S} \left[\lambda\,\text{Sim}_1(D_i, Q) - (1 - \lambda)\max_{D_j \in S} \text{Sim}_2(D_i, D_j)\right]
% \label{eqn:mmr}
% \end{equation}
% where \(Q\) is the query, \(R\) is the ranked list of documents retrieved by an information-retrieval system given a document collection \(C\), \(Q\) and a threshold $\theta$, \(S\) is the subset of documents in \(R\) already selected, \(R\setminus S\) is the set of not yet selected documents, and Sim$_1$ and Sim$_2$ are similarity metrics.
% For the reranking approach using MMR, the calculation of the marginal relevance according to equation \ref{eqn:mmr} is performed for the retrieved sentence pairs as follows: For a specific conclusion \(c\) of a sentence pair the marginal relevance towards all other available conclusions and the respective topic is calculated. Furthermore, the marginal relevance of a premise towards all other available premises and the respective conclusion is calculated. As a result we obtain a marginal relevance value for the premise and the conclusion for each sentence pair.
\subsubsection{Structural distance}
As a second approach, we propose a reranking based on the \textit{structural distance} between query and retrieved sentences. We define the structure of a sentence as a list of part-of-speech tags generated by an out-of-the-box language model of the spaCy NLP library.\footnote{\url{https://spacy.io}} Using the part-of-speech tags of a query and a retrieved sentence, we calculate the Jaro similarity on a tag level instead of a character level. The standard Jaro similarity uses the length of each string, the number of matching characters, and the number of transposed characters between both in a specific interval. Our implementation changes these to the number of part-of-speech tags, the number of matching tags, and the number of transposed tags. This approach allows for fuzzy matching on the structure between two sentences. Finally, we convert the resulting similarity into a distance score. This reranking score imposes a penalty on retrieved sentences that merely rephrase the search query using synonyms.
\subsubsection{Word mover's distance}
In contrast to other distances that examine whole sentence strings, the \textit{word mover's distance} (WMD), proposed by \citeauthor{wmd}~\cite{wmd}, considers the similarity of single words to each other. For each word pair, the earth mover's distance of the corresponding words is calculated using their Word2Vec~\cite{mikolov2013efficient} embeddings. This process is formulated as a combinatorial problem to retrieve the word pairs leading to a minimal cumulative sum of distances of all constructed word pairs. Hence, it accounts for sentences with no words in common but similar meanings due to synonymy. Our reranking tries to leverage this behavior to rank sentences different from the query higher. To lower the computational cost, our implementation uses the \texttt{wmd-relax} package\footnote{\url{https://github.com/src-d/wmd-relax}}, which introduces relaxation to the costly computation of the earth mover's distance.
% In order to rerank the documents, we also experimented with the word mover's distance. It represents semantic differences between sentences using word embeddings. The cosine similarity does not necessarily represent semantic differences accurately, as it measures the cosine of the angle between two vectors, in this context sentence embeddings, but does not take the length of these vectors into account. As a reranking approach to this cosine similarity based ranking, we implemented the word mover's distance. It is built on an initial retrieval using its inherent cosine similarity, on which a reranking on the similarity of conclusion to topic, and premise to respective conclusion. The ranking score of each sentence pair is calculated as weighted sum of these two similarity values.
\subsubsection{Graph-based reranking}
\citeauthor{pagerank_graph}~\cite{pagerank_graph} have proposed a graph-based approach to objectively measure argument relevance based on structural connections between argument units. Their hypothesis states that the content of arguments does not determine their relevance. The reasoning behind this hypothesis is the subjectivity of the content of an argument. Their proposed approach infers argument relevance from the number of arguments whose conclusions serve as a premise for other arguments. Further, the approach incorporates the intrinsic relevance of those arguments in a recursive fashion. The authors adopt vital components of the PageRank algorithm using a framework of argument graphs, where nodes represent arguments and the reuse of conclusions as premises determines an edge between nodes. An edge is constructed based on an interpretation function. The authors use an exact string match as an interpretation function.
Using the nested structure of conclusions and premises provided by the initial retrieval, we initially modeled one argument graph for each topic using the \textit{NetworkX}~\cite{SciPyProceedings_11} graph processing library. For edge interpretation, we used the vector embeddings of the initial retrieval and calculated the cosine similarity between each premise and all the other conclusions. Whenever the similarity surpassed an interpretation threshold of 0.99, we created an edge. We abandoned this approach since we observed that connectivity was overwhelmingly high. The high connectivity can be attributed to the initial retrieval that is also based on the cosine similarity. Due to the highly skewed distribution of cosine similarity (Figure~\ref{fig:cosineWmd}), there were few highly connected argument nodes, and a majority of nodes had only a single connection to another argument (Appendix~\ref{app:cosineDegreeHistogramm}). We theorized that applying PageRank to these argument graphs would not lead to any meaningful reranking scores.
Motivated by this initial setback, we tried to transform the WMD into a similarity to use as an interpretation function, which we call word mover's similarity (WMS). WMS is obtained by the following transformation: \(\mathrm{WMS}(s_1,s_2) = \frac{1}{1+\mathrm{WMD}(s_1,s_2)}\). For this exploration, we assessed an initial interpretation threshold of 0.2 that must be surpassed to draw an edge between two arguments. The general distribution of the resulting similarities differs tremendously from that of the cosine similarities (Figure~\ref{fig:cosineWmd}). Nevertheless, similar to the argument graphs generated using cosine similarity, the node degree distribution is also skewed (Appendix~\ref{app:wmdArgumentEdges}).
Next, we examined the total number of edges for every topic for both interpretation functions (Appendices~\ref{app:cosineArgumentEdges} and~\ref{app:wmdArgumentEdges}). Due to the much lower threshold of the WMS, the general magnitude of total edges in the argument graphs is higher. Increasing the threshold would have led to topics having no argument graph because no WMS score would have surpassed the threshold.
This observation sparked doubts about the applicability of the chosen similarity functions and thresholds. We believe that to generate non-degenerate graphs, a threshold must be chosen individually for each topic.
%Due to a contribution skew in this project and the resulting over utilization of certain team members \footnote{}, we were not able to fully develop this approach.
Further, in theory, some edges could be attributed to the wrong arguments due to the initial deduplication of the indexed dataset. During deduplication, we could not enforce that a particular sentence is linked to the argument ID of the original argument. Due to these challenges, we did not further investigate a reranking based on argument graphs.
\captionof{figure}{\label{fig:cosineWmd}Comparison of the distribution of similarities between Word Mover's Similarity and Cosine Similarity. \small Word Mover's Similarity is obtained by rescaling the Word Mover's Distance. Similarity values were obtained by combining the similarities of the constructed argument graphs over all provided sample topics. Argument graph construction using word mover's similarity used an interpretation threshold of 0.2 to create an edge between two arguments, and the argument graph construction using cosine similarity used a threshold of 0.99.}
% Graph based reranking of argumentative sentence-pairs is inspired by the proposed framework of Wachsmuth et al.\cite{pagerank_graph}.
% Since it is not whole documents that are indexed and queried, but individual sentence pairs that originate in one of these documents, some changes have been made.
% The reranking is defined as follows: at first an adjacency matrix is calculated, describing which conclusion is reused as premise in a different sentence pair and which premise belongs to a conclusion within the same sentence pair. The decision whether a conclusion matches a premise in a different sentence pair is made by calculating the cosine similarity on their vector embeddings and checking if it is greater than \(0.95\). After that, the PageRank algorithm is applied to this adjacency matrix to find those conclusions that are most often reused as premises. For a fictional example with conclusions \(c_1, c_2, c_3\), 2 premises per conclusion and \(c_3\) reused as premise \(p_{22}\), this process is illustrated in \hyperref[fig:graphProcess]{Figure \ref{fig:graphProcess}}.
% First experimental retrieval runs revealed that the similarity threshold of 0.95 often led to a tremendous number of conclusions and premises classified as the same, leading to an almost fully connected graph. Furthermore, this phenomenon was different for each specific topic, so finding a reasonable threshold that fits a retrieval system for general purpose was not possible to our knowledge. Another reason that might have influenced the outcome negatively was the partially dirty content of the corpus, as some sentences were obviously not full sentences or just text snippets from footnotes or sources. Additionally, the sole usage of the initial retrieval could have been problematic, as it only relies on finding similar sentences regarding content, which automatically leads to a large number of sentences that are quite similar regarding their cosine distance in a vector embedding. Therefore, we refrained from implementing this reranking approach and focussed on other methods to improve our results.
% An instance of this problem with 100 conclusions and 50 premises is visualized in Figure \ref{fig:igel}, which describes a highly connected graph on which an application of a pagerank algorithm is problematic.
% \begin{figure}[h]
% \centering
% \includegraphics[width=0.5\linewidth]{igelgraph.jpg}
% \caption{Graph for topic 51 with 100 conclusions and 50 premises per conclusion. Nodes represent arguments, edges connect arguments with cosine similarity > 0.99 for respective conclusions and premises, nodes with degree < 2 were filtered out for visual clarity}
% \label{fig:igel}
% \end{figure}
% %Reranking according to argument relevance, as brought forward by Wachsmuth et al.(?)\cite{pagerank_graph}, was an intriguing approach that we wanted to further assess.
\subsubsection{Final Scoring} \label{sec:finalScoring}
Similar to the initial retrieval and the calculation of the different reranking scores, our software calculates the final reranking score separately for conclusions and premises. For WMD and the structural distance (SD), we compute the final score as a weighted sum of the initial cosine similarity and the respective reranking score (Equation~\ref{eq:finalScoring}). MMR does not need a weighted sum, as MMR itself already includes information from the initial ranking score. In this equation, \(S_{(q,d)}\) denotes the final score between a query \(q\) and a document \(d\), \(I\) the initial ranking score, and \(R\) the respective reranking score. We scale both \(I\) and \(R\) into the interval \([0,1]\). The parameter \(\mu\) was naively tuned by a heuristic evaluation using a debugger attached to our program runtime. For our final evaluations, \(\mu\) was set to 0.9 and 0.75 for conclusions and premises, respectively.
\begin{equation}
S_{(q,d)} = \mu \cdot I(q, d) + (1-\mu) \cdot R(q, d)
\label{eq:finalScoring}
\end{equation}
\section{Evaluation} \label{sec:4}
We performed a manual evaluation of a subset of retrieved topics to evaluate the effectiveness of the presented retrieval and reranking approaches. For each of the first five topics, 100 sentence pairs were retrieved and ranked. After that, the top 20 sentence pairs for each topic were assessed by hand according to relevance and quality criteria. The relevance criterion captures how well the content of the sentence pair fits the topic's content. Quality measures how good a sentence pair is regarding