Skip to content

Commit d3f0ee5

Browse files
committed
OPENNLP-1122: Leipzig sample should allow skip initial entries
1 parent f82b5b5 commit d3f0ee5

File tree

2 files changed

+65
-2
lines changed

2 files changed

+65
-2
lines changed

opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.io.IOException;
2222

2323
import opennlp.tools.cmdline.ArgumentParser;
24+
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
2425
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
2526
import opennlp.tools.cmdline.StreamFactoryRegistry;
2627
import opennlp.tools.cmdline.TerminateToolException;
@@ -47,6 +48,11 @@ interface Parameters extends EncodingParameter {
4748
@ParameterDescription(valueName = "samplesPerLanguage",
4849
description = "number of samples per language")
4950
String getSamplesPerLanguage();
51+
52+
@ParameterDescription(valueName = "samplesToSkip",
53+
description = "number of samples to skip before returning")
54+
@OptionalParameter(defaultValue = "0")
55+
String getSamplesToSkip();
5056
}
5157

5258
protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
@@ -64,9 +70,11 @@ public ObjectStream<LanguageSample> create(String[] args) {
6470
File sentencesFileDir = params.getSentencesDir();
6571

6672
try {
67-
return new SampleShuffleStream(new LeipzigLanguageSampleStream(sentencesFileDir,
73+
return new SampleSkipStream(new SampleShuffleStream(
74+
new LeipzigLanguageSampleStream(sentencesFileDir,
6875
Integer.parseInt(params.getSentencesPerSample()),
69-
Integer.parseInt(params.getSamplesPerLanguage())));
76+
Integer.parseInt(params.getSamplesPerLanguage()) + Integer.parseInt(params.getSamplesToSkip()))),
77+
Integer.parseInt(params.getSamplesToSkip()));
7078
} catch (IOException e) {
7179
throw new TerminateToolException(-1, "IO error while opening sample data.", e);
7280
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.formats.leipzig;
19+
20+
import java.io.IOException;
21+
22+
import opennlp.tools.util.ObjectStream;
23+
24+
class SampleSkipStream<T> implements ObjectStream<T> {
25+
26+
27+
private final ObjectStream<T> samples;
28+
private final int samplesToSkip;
29+
30+
SampleSkipStream(ObjectStream<T> samples, int samplesToSkip) throws IOException {
31+
this.samples = samples;
32+
this.samplesToSkip = samplesToSkip;
33+
34+
skipSamples();
35+
}
36+
37+
@Override
38+
public T read() throws IOException {
39+
return samples.read();
40+
}
41+
42+
@Override
43+
public void reset() throws IOException, UnsupportedOperationException {
44+
this.samples.reset();
45+
skipSamples();
46+
}
47+
48+
private void skipSamples() throws IOException {
49+
int i = 0;
50+
51+
while (i < samplesToSkip && (samples.read()) != null) {
52+
i++;
53+
}
54+
}
55+
}

0 commit comments

Comments
 (0)