Skip to content

Commit 992af7e

Browse files
authored
Merge pull request #282 from rssdev10/fix/style_improvement
Fix/style improvement
2 parents 3ec01c8 + 0230573 commit 992af7e

31 files changed

+538
-502
lines changed

docs/make.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
using Documenter, TextAnalysis
22

33
makedocs(
4-
modules = [TextAnalysis],
5-
sitename = "TextAnalysis",
6-
format = Documenter.HTML(
4+
modules=[TextAnalysis],
5+
sitename="TextAnalysis",
6+
format=Documenter.HTML(
77
),
8-
pages = [
8+
pages=[
99
"Home" => "index.md",
1010
"Documents" => "documents.md",
1111
"Corpus" => "corpus.md",

src/LM/api.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ It is used to evaluate score with masks out of vocabulary words
66
The arguments are the same as for [`score`](@ref)
77
"""
88
function maskedscore(m::Langmodel, temp_lm::DefaultDict, word, context)::Float64
9-
score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
9+
score(m, temp_lm, lookup(m.vocab, [word])[begin], lookup(m.vocab, [context])[begin])
1010
end
1111

1212
"""

src/LM/langmodel.jl

Lines changed: 51 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ abstract type Langmodel end
22
abstract type gammamodel <: Langmodel end #BaseNgram with Add-one smoothing algo
33
abstract type InterpolatedLanguageModel <: Langmodel end #Interpolated language model with smoothing
44

5-
#DataType MLE
6-
#Type for providing MLE ngram model scores.
7-
#Implementation of Base Ngram Model.
5+
# DataType MLE
6+
# Type for providing MLE ngram model scores.
7+
# Implementation of Base Ngram Model.
88

99
struct MLE <: Langmodel
1010
vocab::Vocabulary
@@ -18,13 +18,13 @@ Initiate Type for providing MLE ngram model scores.
1818
Implementation of Base Ngram Model.
1919
2020
"""
21-
function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
21+
function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
2222
MLE(Vocabulary(word, unk_cutoff, unk_label))
2323
end
2424

25-
function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
25+
function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
2626
text = lookup(lm.vocab, text)
27-
text=convert(Array{String}, text)
27+
text = convert(Array{String}, text)
2828
return counter2(text, min, max)
2929
end
3030

@@ -41,18 +41,19 @@ Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores.
4141
In addition to initialization arguments from BaseNgramModel also requires
4242
a number by which to increase the counts, gamma.
4343
"""
44-
function Lidstone(word::Vector{T}, gamma = 1.0, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
44+
function Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
4545
Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
4646
end
4747

48-
function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
48+
function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
4949
text = lookup(lm.vocab, text)
50-
text=convert(Array{String}, text)
50+
text = convert(Array{String}, text)
5151
return counter2(text, min, max)
5252
end
5353

5454
"""
5555
Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
56+
5657
Function to initiate Type(Laplace) for providing Laplace-smoothed scores.
5758
5859
In addition to initialization arguments from BaseNgramModel also requires
@@ -63,11 +64,11 @@ struct Laplace <: gammamodel
6364
gamma::Float64
6465
end
6566

66-
function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
67+
function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
6768
Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0)
6869
end
6970

70-
function (lm::Laplace)(text, min::Integer, max::Integer)
71+
function (lm::Laplace)(text, min::Integer, max::Integer)
7172
text = lookup(lm.vocab, text)
7273
text = convert(Array{String}, text)
7374
return counter2(text, min, max)
@@ -84,35 +85,32 @@ Add-one smoothing to Lidstone or Laplace(gammamodel) models
8485
function score(m::gammamodel, temp_lm::DefaultDict, word, context) #score for gammamodel output probabl
8586
accum = temp_lm[context]
8687
#print(accum)
87-
s = float(sum(accum)+(m.gamma)*length(m.vocab.vocab))
88-
for (text, count) in accum
89-
if text == word
90-
return(float(count+m.gamma)/s)
91-
end
92-
end
93-
return(float(m.gamma)/s)
88+
s = float(sum(accum) + (m.gamma) * length(m.vocab.vocab))
89+
idx = something(findfirst(isequal(word), accum), 0)
90+
return float(idx + m.gamma) / s
9491
end
9592

9693
"""
94+
$(TYPEDSIGNATURES)
95+
9796
Return the probability of `word` given `context`.
9897
9998
In other words, compute the relative frequency of `word` among the counts recorded for the given context.
100-
10199
"""
102100
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
103-
(isnothing(context) || isempty(context)) && return 1.0/length(templ_lm) #provide distribution
101+
(isnothing(context) || isempty(context)) && return 1.0 / length(templ_lm) #provide distribution
104102

105103
accum = templ_lm[context]
106-
s = float(sum(accum))
104+
s = float(sum(accum))
107105
for (text, count) in accum
108106
if text == word
109-
return(float(count) / s)
107+
return (float(count) / s)
110108
end
111109
end
112110
if context in keys(m.vocab.vocab)
113111
return 0.0
114112
end
115-
return(Inf)
113+
return (Inf)
116114
end
117115

118116
"""
@@ -125,8 +123,8 @@ function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
125123
prob(m, temp_lm, word, context)
126124
end
127125

128-
struct WittenBellInterpolated <: InterpolatedLanguageModel
129-
vocab ::Vocabulary
126+
struct WittenBellInterpolated <: InterpolatedLanguageModel
127+
vocab::Vocabulary
130128
end
131129

132130
"""
@@ -137,41 +135,41 @@ Initiate Type for providing Interpolated version of Witten-Bell smoothing.
137135
The idea to abstract this comes from Chen & Goodman 1995.
138136
139137
"""
140-
function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
138+
function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
141139
WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
142140
end
143141

144-
function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
142+
function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
145143
text = lookup(lm.vocab, text)
146-
text=convert(Array{String}, text)
144+
text = convert(Array{String}, text)
147145
return counter2(text, min, max)
148146
end
149147
# alpha_gammma function for WittenBellInterpolated
150148
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
151149
local alpha
152150
local gam
153151
accum = templ_lm[context]
154-
s = float(sum(accum))
155-
for (text,count) in accum
152+
s = float(sum(accum))
153+
for (text, count) in accum
156154
if text == word
157-
alpha=(float(count) / s)
158-
break
155+
alpha = (float(count) / s)
156+
break
159157
else
160-
alpha = 1/s
158+
alpha = 1 / s
161159
end
162160
end
163-
161+
164162
gam = gamma(accum)
165-
return alpha*(1- gam), gam
163+
return alpha * (1 - gam), gam
166164
end
167165

168166
function count_non_zero_vals(accum::Accumulator{})
169-
return(length(accum))
167+
return (length(accum))
170168
end
171-
169+
172170
function gamma(accum)
173-
nplus=count_non_zero_vals(accum)
174-
return(nplus/(nplus+float(sum(accum))))
171+
nplus = count_non_zero_vals(accum)
172+
return (nplus / (nplus + float(sum(accum))))
175173
end
176174

177175
"""
@@ -187,20 +185,20 @@ function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context
187185
(isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)
188186

189187
if context in keys(temp_lm)
190-
alpha,gamma = alpha_gammma(m, temp_lm, word, context)
191-
return (alpha + gamma*score(m, temp_lm, word, context_reduce(context)))
188+
alpha, gamma = alpha_gammma(m, temp_lm, word, context)
189+
return (alpha + gamma * score(m, temp_lm, word, context_reduce(context)))
192190
else
193191
return score(m, temp_lm, word, context_reduce(context))
194192
end
195193
end
196-
194+
197195
function context_reduce(context)
198196
context = split(context)
199197
join(context[2:end], " ")
200198
end
201199

202200

203-
struct KneserNeyInterpolated <: InterpolatedLanguageModel
201+
struct KneserNeyInterpolated <: InterpolatedLanguageModel
204202
vocab::Vocabulary
205203
discount::Float64
206204
end
@@ -213,29 +211,29 @@ Initiate Type for providing KneserNey Interpolated language model.
213211
The idea to abstract this comes from Chen & Goodman 1995.
214212
215213
"""
216-
function KneserNeyInterpolated(word::Vector{T}, disc = 0.1, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
217-
KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label) ,disc)
214+
function KneserNeyInterpolated(word::Vector{T}, disc=0.1, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
215+
KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc)
218216
end
219217

220-
function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
218+
function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
221219
text = lookup(lm.vocab, text)
222-
text=convert(Array{String}, text)
220+
text = convert(Array{String}, text)
223221
return counter2(text, min, max)
224222
end
225223
# alpha_gamma function for KneserNeyInterpolated
226224
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
227225
local alpha
228-
local gamma
226+
local gamma
229227
accum = templ_lm[context]
230-
s = float(sum(accum))
228+
s = float(sum(accum))
231229
for (text, count) in accum
232230
if text == word
233-
alpha=(max(float(count)-m.discount, 0.0) / s)
234-
break
231+
alpha = (max(float(count) - m.discount, 0.0) / s)
232+
break
235233
else
236-
alpha = 1/length(m.vocab.vocab)
234+
alpha = 1 / length(m.vocab.vocab)
237235
end
238236
end
239-
gamma = (m.discount * count_non_zero_vals(accum) /s)
237+
gamma = (m.discount * count_non_zero_vals(accum) / s)
240238
return alpha, gamma
241239
end

0 commit comments

Comments
 (0)