Commit f95a65e1 authored by Dominik Schwabe

setup project

experiments.py
# Created by https://www.toptal.com/developers/gitignore/api/vim,python,react
# Edit at https://www.toptal.com/developers/gitignore?templates=vim,python,react
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
### react ###
.DS_*
logs
**/*.backup.*
**/*.back.*
node_modules
bower_components
*.sublime*
psd
thumb
sketch
### Vim ###
# Swap
[._]*.s[a-v][a-z]
!*.svg # comment out if you don't need vector files
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
*~
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
# End of https://www.toptal.com/developers/gitignore/api/vim,python,react
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
sklearn = "*"
nltk = "*"
spacy = "*"
pandas = "*"
newspaper3k = "*"
langdetect = "*"
[dev-packages]
ipython = "*"
ipdb = "*"
ipython-autoimport = "*"
[requires]
python_version = "3.8"
import pandas as pd
import spacy
import langdetect
from summarizer.util import Tokenizer
from .tfidf import tfidf_score
from .position import position_score
from .average_lexical_connectivity import average_lexical_connectivity
from .stopword_ratio import stopword_ratio
from .word_overlap import WordOverlap
from .length import length_score
from .rank import rank_score
MODELS = {"en": "en_core_web_sm", "de": "de_core_news_sm"}
def load_model(model):
    """load a spacy model, downloading it first if it is not installed"""
try:
return spacy.load(model)
except OSError:
print(f"model {model} not present")
print(f"downloading model {model}")
spacy.cli.download(model)
return spacy.load(model)
class Scorer:
    """scores sentences with several features and extracts the top-ranked ones as a summary"""
def __init__(self, lang="en", rank_score_limit=3, raise_invalid_lang=True):
self.lang = lang.lower()
self.rank_score_limit = rank_score_limit
self.raise_invalid_lang = raise_invalid_lang
model = MODELS[self.lang]
nlp = load_model(model)
self.tokenizer = Tokenizer(nlp)
self.nlp = nlp
def get_features(self, document, title=None):
if self.raise_invalid_lang:
detected_lang = langdetect.detect(document)
if detected_lang != self.lang:
raise ValueError(f"the language of the text ({detected_lang}) and of the loaded model ({self.lang}) do not match")
sentences = self.tokenizer.tokenize(document)
        if not sentences:
            # keep the return shape consistent with the non-empty case
            return [], pd.DataFrame()
scores = pd.DataFrame()
scores["tfidf"] = tfidf_score(sentences, use_lemma=True, sum=False)
scores["position"] = position_score(sentences, linear=False, use_exp=True)
scores["average_lexical_connectivity"] = average_lexical_connectivity(sentences)
scores["stopword_ratio"] = stopword_ratio(sentences)
scores["length"] = length_score(sentences)
if title is not None:
scores["word_overlap"] = WordOverlap(title, nlp=self.nlp).score(sentences)
scores["rank"] = rank_score(
sentences, scores.prod(axis=1), limit=self.rank_score_limit
)
sentences = [sent.text.strip() for sent in sentences]
return sentences, scores
def score(self, document, title=None):
sentences, scores = self.get_features(document, title)
scores = scores.prod(axis=1).values
return sentences, scores
def summarize(self, document, title=None, num_sentences=3):
sentences, scores = self.score(document, title)
df = pd.DataFrame({"sentences": sentences, "scores": scores})
df = df.reset_index()
df = df.sort_values("scores", ascending=False)
df = df[:num_sentences]
df = df.sort_values("index")
summary = " ".join(df["sentences"])
return summary
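A minimal usage sketch of the Scorer above (assumptions: the class is importable from the summarizer package, e.g. as summarizer.scorer.Scorer, which this diff does not show, and the URL is a placeholder):
# sketch: summarize a downloaded news article with the Scorer above
from summarizer.scorer import Scorer          # assumed module path
from summarizer.util import download_article  # defined further below
article = download_article("https://example.com/article")  # placeholder URL
scorer = Scorer(lang="en", raise_invalid_lang=False)
summary = scorer.summarize(article["text"], title=article["title"], num_sentences=3)
print(summary)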
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from summarizer.util import filter_tokens
from functools import partial
def average_lexical_connectivity(sentences, use_exp=True):
"""number of terms in a sentences shared with other sentences
devided by the number of sentences"""
analyzer = partial(filter_tokens, use_lemma=True)
vectorizer = CountVectorizer(binary=True, analyzer=analyzer)
try:
matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        # empty vocabulary (e.g. only stopwords): fall back to neutral scores
        return np.ones(len(sentences))
shared_terms = np.asarray(matrix.sum(axis=0)).ravel() > 1
features = []
for row_index in range(matrix.shape[0]):
feature = shared_terms[matrix[row_index].nonzero()[1]].sum()
features.append(feature)
scores = np.array(features) / len(sentences)
if use_exp:
scores = np.exp(scores)
return scores
import numpy as np
from summarizer.util import filter_tokens
def length_score(sentences, use_exp=True):
"""length of the sentence devided by the maximum sentence length
"""
sent_length = np.array([len(filter_tokens(sent)) for sent in sentences])
scores = sent_length/np.max((np.max(sent_length), 1))
if use_exp:
scores = np.exp(scores)
return scores
import numpy as np
def inverse_score(sentences, start=3):
"""independent of number of sentences"""
return 1 / np.arange(start, len(sentences) + start)
def linear_score(sentences):
"""dependent of number of sentences"""
return np.linspace(1, 0, len(sentences), endpoint=False)
def position_score(sentences, linear=True, use_exp=False, inverse_start=3):
"""gives sentences at the beginning higher scores
"""
scores = linear_score(sentences) if linear else inverse_score(sentences, start=inverse_start)
if use_exp:
scores = np.exp(scores)
return scores
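A quick sketch comparing the two position schemes (only len(sentences) is used, so plain strings can stand in for spacy sentence spans):
# sketch: position scores for four dummy sentences
sents = ["s1", "s2", "s3", "s4"]
print(linear_score(sents))            # -> [1.0, 0.75, 0.5, 0.25]
print(inverse_score(sents, start=3))  # -> [1/3, 1/4, 1/5, 1/6]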
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functools import partial
from itertools import chain
from summarizer.util import filter_tokens
def rank_score(sentences, scores, limit=3, use_exp=True):
"""cosine similarity to the highest ranked sentences
in the tfidf matrix. the highest ranked sentences are
combined into one document.
"""
if len(sentences) <= limit:
rank_scores = np.ones(len(sentences))
else:
df = pd.DataFrame({"sentences": sentences, "scores": scores})
df.reset_index(inplace=True)
df.sort_values("scores", inplace=True, ascending=False)
query = df["sentences"][:limit]
docs = df["sentences"][limit:]
analyzer = partial(filter_tokens, use_lemma=True)
vectorizer = TfidfVectorizer(analyzer=analyzer, smooth_idf=True)
        # flatten the top-ranked sentences into a single pseudo-document of tokens
        query = list(chain.from_iterable(query))
tfidf = vectorizer.fit_transform(docs)
query_vec = vectorizer.transform([query])
rank_scores = cosine_similarity(query_vec, tfidf).ravel()
rank_scores = np.concatenate((np.ones(limit), rank_scores))
rank_scores = pd.Series(rank_scores, index=df["index"])
rank_scores = rank_scores.sort_index().values
if use_exp:
rank_scores = np.exp(rank_scores)
return rank_scores
import numpy as np
ARTICLES = {"a", "an"}
SPECIAL_POS_TAGS = {"NOUN", "ADJ", "PROPN"}
def has_special_feature(token):
return (
token.is_currency
or token.is_digit
or (token.text in ARTICLES and token.pos_ == "DET")
or token.pos_ in SPECIAL_POS_TAGS
or token.ent_type_ != ""
# or token.ent_type_ == "DATE"
)
def special_token_score(sentences):
    """number of tokens per sentence that carry a special feature"""
features = []
for sentence in sentences:
score = 0
for token in sentence:
if has_special_feature(token):
score += 1
features.append(score)
return np.array(features)
import numpy as np
from summarizer.util import filter_tokens
def get_stopwords(sentence):
return [token for token in sentence if token.is_stop]
def stopword_ratio(sentences):
"""number of non-stopwords in the sentence devided by the
number of words in the sentence"""
return 1 - np.array(
[len(get_stopwords(sent)) / np.max((len(filter_tokens(sent, remove_stopwords=False)), 1)) for sent in sentences]
)
# TODO: length penalty
# TODO: try normalized tfidf (length normalized)
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import partial
import numpy as np
from summarizer.util import filter_tokens
def sum_feature(matrix):
"""add up the entries of the matrix for each row"""
return np.asarray(matrix.sum(axis=1)).ravel()
def mul_feature(matrix):
"""add 1 to each entry in the matrix and multiply
the entries for each row"""
features = []
for row_index in range(matrix.shape[0]):
feature = (matrix[row_index].data + 1).prod()
features.append(feature)
return np.array(features)
def tfidf_score(sentences, sum=True, smooth_idf=False, use_lemma=False):
"""compute the tfidf vectors and add the entries up by
summation or multiplication after adding 1
"""
analyzer = partial(filter_tokens, use_lemma=use_lemma)
vectorizer = TfidfVectorizer(analyzer=analyzer, smooth_idf=smooth_idf)
try:
tfidf = vectorizer.fit_transform(sentences)
except ValueError:
return np.ones(len(sentences))
features = sum_feature(tfidf) if sum else mul_feature(tfidf)
return features
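A small sketch of the two aggregation modes on a hand-made sparse matrix (values chosen for illustration only):
# sketch: sum_feature vs. mul_feature on a toy sparse matrix
from scipy.sparse import csr_matrix
m = csr_matrix([[0.5, 0.0, 0.2], [0.0, 0.3, 0.0]])
print(sum_feature(m))  # -> [0.7, 0.3]
print(mul_feature(m))  # -> approximately [1.8, 1.3], i.e. (1.5 * 1.2) and (1.3)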
import numpy as np
from summarizer.util import Tokenizer, filter_tokens
class WordOverlap:
    def __init__(self, words, nlp):
        if isinstance(words, str):
            # split into sentences, then flatten to filtered lemma strings
            sentences = Tokenizer(nlp).tokenize(words)
            words = [
                word
                for sent in sentences
                for word in filter_tokens(sent, use_lemma=True)
            ]
        else:
            words = words.copy()
        self.words = set(words)
def score(self, sentences, log_smooth=True):
"""number of words shared with the title"""
features = []
for sentence in sentences:
words = set(filter_tokens(sentence, use_lemma=True))
feature = len(words & self.words)
features.append(feature)
features = np.array(features)
if log_smooth:
features = np.log(features + 1) + 1
else:
features += 1
return features
from newspaper import Article
def download_article(url):
article = Article(url)
article.download()
article.parse()
return {"text": article.text, "title": article.title}
class Tokenizer:
def __init__(self, nlp):
self.nlp = nlp
def tokenize(self, document):
return [sent for sent in self.nlp(document).sents if sent.text.strip() != ""]
def filter_tokens(tokens, remove_stopwords=True, use_lemma=False):
    """drop stopword, punctuation and whitespace tokens and return the rest
    as strings (lowercased lemmas if use_lemma is set)"""
return [
token.lemma_.lower() if use_lemma else token.text
for token in tokens
if (not remove_stopwords or not token.is_stop)
and not token.is_punct
and not token.is_space
]
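A short sketch of the helpers above (assumes the en_core_web_sm model is installed; the exact output depends on spacy's stopword list and lemmatizer):
# sketch: sentence splitting and token filtering with the helpers above
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("The cats are sleeping on the mat. It is quiet.")
print([sent.text for sent in Tokenizer(nlp).tokenize(doc.text)])
# -> roughly ['The cats are sleeping on the mat.', 'It is quiet.']
print(filter_tokens(doc))                  # e.g. ['cats', 'sleeping', 'mat', 'quiet']
print(filter_tokens(doc, use_lemma=True))  # e.g. ['cat', 'sleep', 'mat', 'quiet']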