Commit 6655fa0e authored by Janos Borst
Browse files

master merge

parent 4bd340a6
Pipeline #36804 passed with stages
in 2 minutes and 55 seconds
......@@ -78,7 +78,7 @@ class ZAGCNNLM2(TextClassificationAbstract):
from ..representation import get_word_embedding_mean
l = get_word_embedding_mean(
[" ".join(re.split("[/ _-]", x.lower())) for x in self.classes.keys()],
"/disk1/users/jborst/Data/Embeddings/glove/en/glove.6B.300d_small.txt")
"glove300")
if scale=="mean":
print("subtracting mean")
......
import numpy as np
from transformers import *
import torch
from pathlib import Path
from urllib import error
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
EMBEDDINGCACHE = Path.home() / ".mlmc" / "embedding"
MODELS = {"bert": (BertModel, BertTokenizer, 'bert-large-uncased'),
"bert_cased": (BertModel, BertTokenizer, 'bert-base-cased'),
......@@ -16,8 +24,26 @@ MODELS = {"bert": (BertModel, BertTokenizer, 'bert-large-uncased'),
def load_static(embedding="/disk1/users/jborst/Data/Embeddings/glove/en/glove.6B.50d_small.txt"):
glove = np.loadtxt(embedding, dtype='str', comments=None)
def load_static(embedding="glove300"):
embeddingfiles = {"glove50": "glove.6B.50d.txt",
"glove100": "glove.6B.100d.txt",
"glove200": "glove.6B.200d.txt",
"glove300": "glove.6B.300d.txt"}
if not (EMBEDDINGCACHE / embeddingfiles[embedding]).exists():
URL ="http://nlp.stanford.edu/data/glove.6B.zip"
try:
resp = urlopen(URL)
except error.HTTPError:
print(error.HTTPError)
return None
assert resp.getcode() == 200, "Download not found Error: (%i)" % (resp.getcode(),)
print("Downloading glove vectors... This may take a while...")
zipfile = ZipFile(BytesIO(resp.read()))
zipfile.extractall(EMBEDDINGCACHE)
fp = EMBEDDINGCACHE / embeddingfiles[embedding]
glove = np.loadtxt(fp, dtype='str', comments=None)
glove = glove[np.unique(glove[:,:1],axis=0, return_index=True)[1]]
words = glove[:, 0]
weights = glove[:, 1:].astype('float')
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment