6
6
wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
7
7
unzip java.zip
8
8
python notebooks/codesearchnet-opennmt.py \
9
- --data_dir ='java/final/jsonl/valid' \
9
+ --data-dir ='java/final/jsonl/valid' \
10
10
--newline='\\ n'
11
11
"""
12
12
from argparse import ArgumentParser , Namespace
20
20
21
21
logging .basicConfig (level = logging .INFO )
22
22
23
+ # restore the default SIGPIPE handler so the script is *nix CLI friendly, e.g. when piped into `head`
24
+ from signal import signal , SIGPIPE , SIG_DFL
23
25
24
- class CodeSearchNetRAM (object ):
26
+ signal (SIGPIPE , SIG_DFL )
27
+
28
+
29
+ class CodeSearchNetRAM :
25
30
"""Stores one split of CodeSearchNet data in memory"""
26
31
27
32
def __init__ (self , split_path : Path , newline_repl : str ):
@@ -64,13 +69,10 @@ def __getitem__(self, idx: int) -> Tuple[str, str]:
64
69
65
70
# drop fn signature
66
71
code = row ["code" ]
67
- fn_body = (
68
- code [
69
- code .find ("{" , code .find (fn_name ) + len (fn_name )) + 1 : code .rfind ("}" )
70
- ]
71
- .lstrip ()
72
- .rstrip ()
73
- )
72
+ fn_body = code [
73
+ code .find ("{" , code .find (fn_name ) + len (fn_name )) + 1 : code .rfind ("}" )
74
+ ]
75
+ fn_body = fn_body .strip ()
74
76
fn_body = fn_body .replace ("\n " , self .newline_repl )
75
77
# fn_body_enc = self.enc.encode(fn_body)
76
78
@@ -111,9 +113,7 @@ def main(args: Namespace) -> None:
111
113
help = "Path to the unziped input data (CodeSearchNet)" ,
112
114
)
113
115
114
- parser .add_argument (
115
- "--newline" , type = str , default = "\\ n" , help = "Replace newline with this"
116
- )
116
+ parser .add_argument ("--newline" , default = "\\ n" , help = "Replace newline with this" )
117
117
118
118
parser .add_argument (
119
119
"--token-level-sources" ,
@@ -128,14 +128,11 @@ def main(args: Namespace) -> None:
128
128
)
129
129
130
130
parser .add_argument (
131
- "--src_file" ,
132
- type = str ,
133
- default = "src-%s.token" ,
134
- help = "File with function bodies" ,
131
+ "--src-file" , default = "src-%s.token" , help = "File with function bodies" ,
135
132
)
136
133
137
134
parser .add_argument (
138
- "--tgt_file" , type = str , default = "tgt-%s.token" , help = "File with function texts"
135
+ "--tgt-file" , default = "tgt-%s.token" , help = "File with function texts"
139
136
)
140
137
141
138
parser .add_argument (
0 commit comments