@@ -2,9 +2,9 @@ abstract type Langmodel end
abstract type gammamodel <: Langmodel end  # BaseNgram with Add-one smoothing algorithm
abstract type InterpolatedLanguageModel <: Langmodel end  # Interpolated language model with smoothing

-#DataType MLE
-#Type for providing MLE ngram model scores.
-#Implementation of Base Ngram Model.
+# DataType MLE
+# Type for providing MLE ngram model scores.
+# Implementation of Base Ngram Model.

struct MLE <: Langmodel
    vocab::Vocabulary
@@ -18,13 +18,13 @@ Initiate Type for providing MLE ngram model scores.
Implementation of Base Ngram Model.

"""
-function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
+function MLE(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    MLE(Vocabulary(word, unk_cutoff, unk_label))
end

-function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
+function (lm::MLE)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    text = lookup(lm.vocab, text)
-    text=convert(Array{String}, text)
+    text = convert(Array{String}, text)
    return counter2(text, min, max)
end
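
For orientation, a minimal usage sketch of the MLE pieces above (the corpus is invented here; `score` for MLE is defined later in this file):

train = ["khan", "is", "my", "good", "friend", "khan", "is", "a", "boy"]  # hypothetical toy corpus
lm = MLE(train)            # builds a Vocabulary with unk_cutoff = 1
fit = lm(train, 2, 2)      # bigram counts from counter2: context => Accumulator of next words
score(lm, fit, "is", "khan")  # relative frequency of "is" after "khan"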
@@ -41,18 +41,19 @@ Function to initiate Type(Lidstone) for providing Lidstone-smoothed scores.
In addition to the initialization arguments of BaseNgramModel, it also requires
gamma, a number by which to increase the counts.
"""
-function Lidstone(word::Vector{T}, gamma = 1.0, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
+function Lidstone(word::Vector{T}, gamma=1.0, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    Lidstone(Vocabulary(word, unk_cutoff, unk_label), gamma)
end

-function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
+function (lm::Lidstone)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    text = lookup(lm.vocab, text)
-    text=convert(Array{String}, text)
+    text = convert(Array{String}, text)
    return counter2(text, min, max)
end
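
Here gamma is the additive pseudo-count used by the gammamodel `score` further down; a hedged sketch with invented data:

lm = Lidstone(["a", "b", "c", "a"], 0.5)  # gamma = 0.5
fit = lm(["a", "b", "c", "a"], 2, 2)
# score(lm, fit, word, context) then evaluates (count + 0.5) / (total + 0.5 * length(lm.vocab.vocab))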

"""
    Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
+
Function to initiate the Laplace type for providing Laplace-smoothed scores.

In addition to the initialization arguments of BaseNgramModel, it also requires
@@ -63,11 +64,11 @@ struct Laplace <: gammamodel
    gamma::Float64
end

-function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
+function Laplace(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    Laplace(Vocabulary(word, unk_cutoff, unk_label), 1.0)
end
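
As the constructor shows, Laplace is the gamma = 1.0 special case, e.g. (toy data):

lm = Laplace(["a", "b", "a"])
lm.gamma == 1.0  # add-one smoothing; equivalent in effect to Lidstone(words, 1.0)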

-function (lm::Laplace)(text, min::Integer, max::Integer)
+function (lm::Laplace)(text, min::Integer, max::Integer)
    text = lookup(lm.vocab, text)
    text = convert(Array{String}, text)
    return counter2(text, min, max)
@@ -84,35 +85,32 @@ Add-one smoothing to Lidstone or Laplace(gammamodel) models
function score(m::gammamodel, temp_lm::DefaultDict, word, context)  # score for gammamodel: outputs a probability
    accum = temp_lm[context]
    # print(accum)
-    s = float(sum(accum)+(m.gamma)*length(m.vocab.vocab))
-    for (text, count) in accum
-        if text == word
-            return(float(count+m.gamma)/s)
-        end
-    end
-    return(float(m.gamma)/s)
+    s = float(sum(accum) + m.gamma * length(m.vocab.vocab))
+    count = accum[word]  # an Accumulator returns 0 for a word unseen in this context
+    return float(count + m.gamma) / s
end
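
Concretely, with gamma = 1, a vocabulary of 3 types, and a context whose Accumulator holds "b" => 2 and "c" => 1 (numbers invented for illustration):

# s = (2 + 1) + 1 * 3 = 6
# word "b": (2 + 1) / 6 = 0.5
# word "a": (0 + 1) / 6 ≈ 0.167, so an unseen pair still receives gamma / s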

"""
+$(TYPEDSIGNATURES)
+
To get the probability of a word given its context.

In other words, for the given context, compute the frequency distribution of the word.
-
"""
function prob(m::Langmodel, templ_lm::DefaultDict, word, context=nothing)::Float64
-    (isnothing(context) || isempty(context)) && return 1.0/length(templ_lm) #provide distribution
+    (isnothing(context) || isempty(context)) && return 1.0 / length(templ_lm)  # provide distribution

    accum = templ_lm[context]
-    s = float(sum(accum))
+    s = float(sum(accum))
    for (text, count) in accum
        if text == word
-            return(float(count) / s)
+            return (float(count) / s)
        end
    end
    if context in keys(m.vocab.vocab)
        return 0.0
    end
-    return(Inf)
+    return (Inf)
end
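
Note the fall-through convention above: a word with no count after an in-vocabulary context scores 0.0, while an out-of-vocabulary context yields Inf. A sketch with hypothetical fitted counts:

# prob(lm, fit, "is", "khan")    # count of "is" after "khan" divided by the context total
# prob(lm, fit, "zzz", "khan")   # 0.0 when "khan" is in the vocabulary
# prob(lm, fit, "is", "qwerty")  # Inf when the context itself is unknown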
"""
@@ -125,8 +123,8 @@ function score(m::MLE, temp_lm::DefaultDict, word, context=nothing)
    prob(m, temp_lm, word, context)
end

-struct WittenBellInterpolated <: InterpolatedLanguageModel
-    vocab :: Vocabulary
+struct WittenBellInterpolated <: InterpolatedLanguageModel
+    vocab::Vocabulary
end
"""
@@ -137,41 +135,41 @@ Initiate Type for providing Interpolated version of Witten-Bell smoothing.
The idea of this abstraction comes from Chen & Goodman (1995).

"""
-function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
+function WittenBellInterpolated(word::Vector{T}, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
    WittenBellInterpolated(Vocabulary(word, unk_cutoff, unk_label))
end

-function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
+function (lm::WittenBellInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    text = lookup(lm.vocab, text)
-    text=convert(Array{String}, text)
+    text = convert(Array{String}, text)
    return counter2(text, min, max)
end
# alpha_gamma function for WittenBellInterpolated
function alpha_gammma(m::WittenBellInterpolated, templ_lm::DefaultDict, word, context)
    local alpha
    local gam
    accum = templ_lm[context]
-    s = float(sum(accum))
-    for (text,count) in accum
+    s = float(sum(accum))
+    for (text, count) in accum
        if text == word
-            alpha=(float(count) / s)
-            break
+            alpha = (float(count) / s)
+            break
        else
-            alpha = 1/s
+            alpha = 1 / s
        end
    end
-
+
    gam = gamma(accum)
-    return alpha*(1-gam), gam
+    return alpha * (1 - gam), gam
end

function count_non_zero_vals(accum::Accumulator{})
-    return(length(accum))
+    return (length(accum))
end
-
+
function gamma(accum)
-    nplus=count_non_zero_vals(accum)
-    return(nplus/(nplus+float(sum(accum))))
+    nplus = count_non_zero_vals(accum)
+    return (nplus / (nplus + float(sum(accum))))
end
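
`gamma` above is the Witten-Bell back-off weight: with nplus distinct continuation types out of sum(accum) total counts, it reserves nplus / (nplus + total) of the probability mass for the shorter context. With invented counts:

# accum: "b" => 2, "c" => 1  =>  nplus = 2, total = 3
# gamma(accum) == 2 / (2 + 3) == 0.4  # 40% of the mass backs off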
"""
@@ -187,20 +185,20 @@ function score(m::InterpolatedLanguageModel, temp_lm::DefaultDict, word, context
    (isnothing(context) || isempty(context)) && return prob(m, temp_lm, word)

    if context in keys(temp_lm)
-        alpha,gamma = alpha_gammma(m, temp_lm, word, context)
-        return (alpha + gamma*score(m, temp_lm, word, context_reduce(context)))
+        alpha, gamma = alpha_gammma(m, temp_lm, word, context)
+        return (alpha + gamma * score(m, temp_lm, word, context_reduce(context)))
    else
        return score(m, temp_lm, word, context_reduce(context))
    end
end
-
+
function context_reduce(context)
    context = split(context)
    join(context[2:end], " ")
end
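
`context_reduce` drops the leftmost token so that `score` can recurse on ever shorter contexts, which is what makes the alpha + gamma * score(...) interpolation above terminate. For example:

# context_reduce("khan is my") == "is my"
# context_reduce("is my") == "my"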

-struct KneserNeyInterpolated <: InterpolatedLanguageModel
+struct KneserNeyInterpolated <: InterpolatedLanguageModel
    vocab::Vocabulary
    discount::Float64
end
@@ -213,29 +211,29 @@ Initiate Type for providing KneserNey Interpolated language model.
The idea of this abstraction comes from Chen & Goodman (1995).

"""
-function KneserNeyInterpolated(word::Vector{T}, disc = 0.1, unk_cutoff=1, unk_label="<unk>") where {T <: AbstractString}
-    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label) ,disc)
+function KneserNeyInterpolated(word::Vector{T}, disc=0.1, unk_cutoff=1, unk_label="<unk>") where {T<:AbstractString}
+    KneserNeyInterpolated(Vocabulary(word, unk_cutoff, unk_label), disc)
end

-function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T <: AbstractString}
+function (lm::KneserNeyInterpolated)(text::Vector{T}, min::Integer, max::Integer) where {T<:AbstractString}
    text = lookup(lm.vocab, text)
-    text=convert(Array{String}, text)
+    text = convert(Array{String}, text)
    return counter2(text, min, max)
end
# alpha_gamma function for KneserNeyInterpolated
function alpha_gammma(m::KneserNeyInterpolated, templ_lm::DefaultDict, word, context)
    local alpha
-    local gamma
+    local gamma
    accum = templ_lm[context]
-    s = float(sum(accum))
+    s = float(sum(accum))
    for (text, count) in accum
        if text == word
-            alpha=(max(float(count)-m.discount, 0.0) / s)
-            break
+            alpha = (max(float(count) - m.discount, 0.0) / s)
+            break
        else
-            alpha = 1/length(m.vocab.vocab)
+            alpha = 1 / length(m.vocab.vocab)
        end
    end
-    gamma = (m.discount * count_non_zero_vals(accum) /s)
+    gamma = (m.discount * count_non_zero_vals(accum) / s)
    return alpha, gamma
end
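
The branch above implements absolute discounting: every observed count is reduced by m.discount, and gamma hands that mass to the lower-order recursion in `score`. With invented counts "b" => 2, "c" => 1 (so s = 3.0) and the default discount of 0.1:

# alpha for "b": max(2 - 0.1, 0) / 3 ≈ 0.633
# gamma: 0.1 * 2 / 3 ≈ 0.067  # the discounted mass, redistributed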