Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Wiki Rasa
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
tm-chatbot
Wiki Rasa
Commits
17960c6b
Commit
17960c6b
authored
6 years ago
by
Lucas Schons
Browse files
Options
Downloads
Patches
Plain Diff
Apply styler, fix typo
parent
f5773748
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!47
Resolve "Improve pattern matching function"
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
processing/wikiproc/R/utils.R
+44
-47
44 additions, 47 deletions
processing/wikiproc/R/utils.R
with
44 additions
and
47 deletions
processing/wikiproc/R/utils.R
+
44
−
47
View file @
17960c6b
...
...
@@ -13,7 +13,7 @@
#'
#' infobox <- get_infobox(articles$Text[54])
#'
#' infobox[3:4,]
#' infobox[3:4,
]
#' }
get_infobox
<-
function
(
article
)
{
# Read page as html
...
...
@@ -27,7 +27,7 @@ get_infobox <- function(article) {
xml2
::
xml_find_all
(
page
,
".//br"
)
%>%
xml2
::
xml_remove
(
.
)
# Get the info box
# Will throw an error if there isnt any, so that should be checked beforehand
...
...
@@ -70,19 +70,19 @@ get_infobox <- function(article) {
#' @examples
#' \dontrun{
#' # Find all proper nouns following an and
#'
#'
#' # Both notations are working
#' text <- "and *"
#' pos <- c("*", "PROPN")
#'
#'
#' # Prepare data
#' articles <- get_data()
#' annotations <- create_annotations(
#' clean_html(articles[articles$Title == "Neil Ashby",4]),
#' articles[articles$Title == "Neil Ashby",2],
#' articles[articles$Title == "Neil Ashby",3]
#' clean_html(articles[articles$Title == "Neil Ashby",
4]),
#' articles[articles$Title == "Neil Ashby",
2],
#' articles[articles$Title == "Neil Ashby",
3]
#' )
#'
#'
#' results <- match_pattern(annotations, text, pos, ignore.punct = TRUE)
#' results
#' # [1] "and Frequency" "and Technology" "and Gravitation"
...
...
@@ -96,98 +96,96 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
if
(
length
(
pos
)
==
1
)
{
pos
<-
unlist
(
strsplit
(
pos
,
" "
,
fixed
=
TRUE
))
}
# We need at the very least to tokens to define a pattern
# We need at the very least t
w
o tokens to define a pattern
if
(
length
(
text
)
<=
1
|
length
(
pos
)
<=
1
)
{
stop
(
"Error: Input needs to be contain at least two tokens"
)
}
# Check the length of our inputs matches
if
(
length
(
text
)
!=
length
(
pos
))
{
stop
(
"Error: Text and pos need to be of equal length"
)
}
# Let's build the vector we're going to be searching through
# TODO: This way of doing this is probably inefficent
combined
<-
NA
annot
<-
cleanNLP
::
cnlp_get_token
(
annotations
)
# Remove spaces as they would mess up a lot down there
annot
<-
annot
[
annot
$
upos
!=
"SPACE"
,]
annot
<-
annot
[
annot
$
upos
!=
"SPACE"
,
]
# Remove punctuation if that is desired but keep sentence ending marks aka full stops
if
(
ignore.punct
)
{
annot
<-
annot
[
annot
$
upos
!=
"PUNCT"
||
annot
$
word
==
"."
,]
if
(
ignore.punct
)
{
annot
<-
annot
[
annot
$
upos
!=
"PUNCT"
||
annot
$
word
==
"."
,
]
}
# Now to actually constructing the vector
if
(
use.stems
)
{
if
(
use.stems
)
{
combined
<-
paste
(
annot
$
word
,
annot
$
lemma
,
annot
$
upos
,
sep
=
"\u001F"
)
}
else
{
combined
<-
paste
(
annot
$
word
,
annot
$
upos
,
sep
=
"\u001F"
)
}
# We could also use something else to collapse, we would slightly need to modify the query
# but we could skip removing spaces. If that is worth anything?
# Also appending an extra space so we can match the last element
combined
<-
paste0
(
paste
(
combined
,
collapse
=
" "
),
" "
)
# Now let us construct the regex
expr
<-
sapply
(
seq_along
(
text
),
function
(
x
)
{
# Create regex vector
re
<-
character
()
# If using stems we want to ignore the first part of the data
# as it contains the original word for later use
if
(
use.stems
)
{
if
(
use.stems
)
{
# Match everything up to the seperator value
re
<-
"([^ ])+?\u001F"
}
# If we get a wildcard we match anything but spaces that comes before a #
# If not we match exactly the word we got
if
(
text
[
x
]
==
"*"
)
{
if
(
text
[
x
]
==
"*"
)
{
re
<-
paste0
(
re
,
"([^ ])+?\u001F"
)
}
else
{
re
<-
paste0
(
re
,
paste0
(
text
[
x
],
"\u001F"
))
}
# Now we do a very similar thing for the pos tags
# Except we know they will only contain capital letters, always
if
(
pos
[
x
]
==
"*"
)
{
if
(
pos
[
x
]
==
"*"
)
{
re
<-
paste0
(
re
,
"([A-Z])+? "
)
}
else
{
re
<-
paste0
(
re
,
paste0
(
pos
[
x
],
" "
))
}
})
# Stitch it all together
expr
<-
paste
(
expr
,
collapse
=
""
)
matches
<-
gregexpr
(
expr
,
combined
,
ignore.case
=
ignore.case
)
# If there was no match we may exit here
if
(
matches
[[
1
]][
1
]
==
-1
)
{
# Explicit returns are bad, i know. But what to do here?
return
(
NA
)
}
# What do we want to do with the result data? This currently extracts it,
# strips the POS tags of and then goes home happy
matches
<-
unlist
(
regmatches
(
combined
,
matches
))
if
(
use.stems
)
{
if
(
use.stems
)
{
matches
<-
sapply
(
matches
,
function
(
match
)
{
match
<-
trimws
(
gsub
(
"\u001F([^ ])+?\u001F([A-Z])+ "
,
" "
,
match
),
which
=
"right"
)
})
...
...
@@ -196,8 +194,7 @@ match_pattern <- function(annotations, text, pos, use.stems = FALSE, ignore.case
match
<-
trimws
(
gsub
(
"\u001F([A-Z])+ "
,
" "
,
match
),
which
=
"right"
)
})
}
unname
(
matches
)
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment