@@ -41,6 +41,13 @@ import org.apache.spark.util.collection.PartitionedSerializedPairBuffer._
  *
  * Currently, only sorting by partition is supported.
  *
+ * Each record is laid out inside the metaBuffer as follows. keyStart, a long, is split across
+ * two integers:
+ *
+ *   +-------------+------------+------------+-------------+
+ *   |         keyStart         | keyValLen  | partitionId |
+ *   +-------------+------------+------------+-------------+
+ *
  * @param metaInitialRecords The initial number of entries in the metadata buffer.
  * @param kvBlockSize The size of each byte buffer in the ChainedBuffer used to store the records.
  * @param serializerInstance the serializer used for serializing inserted records.
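For concreteness, here is a minimal sketch of the four-int record the diagram describes; the metaRecord name is illustrative, not part of the patch:

    // One metadata record as four ints, matching the KEY_START / KEY_VAL_LEN /
    // PARTITION slots defined at the bottom of this diff.
    def metaRecord(keyStart: Long, keyValLen: Int, partitionId: Int): Array[Int] =
      Array(
        keyStart.toInt,          // slot 0: lower 32 bits of the Long offset
        (keyStart >> 32).toInt,  // slot 1: upper 32 bits
        keyValLen,               // slot 2: serialized key + value length
        partitionId)             // slot 3: partition id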
@@ -68,19 +75,15 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
     }

     val keyStart = kvBuffer.size
-    if (keyStart < 0) {
-      throw new Exception(s"Can't grow buffer beyond ${1 << 31} bytes")
-    }
     kvSerializationStream.writeKey[Any](key)
-    kvSerializationStream.flush()
-    val valueStart = kvBuffer.size
     kvSerializationStream.writeValue[Any](value)
     kvSerializationStream.flush()
-    val valueEnd = kvBuffer.size
+    val keyValLen = (kvBuffer.size - keyStart).toInt

-    metaBuffer.put(keyStart)
-    metaBuffer.put(valueStart)
-    metaBuffer.put(valueEnd)
+    // keyStart, a long, gets split across two ints
+    metaBuffer.put(keyStart.toInt)
+    metaBuffer.put((keyStart >> 32).toInt)
+    metaBuffer.put(keyValLen)
     metaBuffer.put(partition)
   }

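Note that the length, unlike the start offset, can safely stay an Int: a single record's serialized size remains small even when the buffer as a whole has grown past 2 GB. A tiny illustration with hypothetical values:

    val keyStart: Long = 3L * 1024 * 1024 * 1024      // record starts past 2 GB: needs a Long
    val recordEnd: Long = keyStart + 128               // but one record itself is small
    val keyValLen: Int = (recordEnd - keyStart).toInt  // so the difference fits an Int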
@@ -114,7 +117,7 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
     }
   }

-  override def estimateSize: Long = metaBuffer.capacity * 4 + kvBuffer.capacity
+  override def estimateSize: Long = metaBuffer.capacity * 4L + kvBuffer.capacity

   override def destructiveSortedWritablePartitionedIterator(keyComparator: Option[Comparator[K]])
     : WritablePartitionedIterator = {
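The 4L is the substance of this change: metaBuffer.capacity is an Int, so capacity * 4 is evaluated in Int arithmetic and wraps before widening to Long. A small illustration with a hypothetical capacity:

    val capacity = 600000000        // hypothetical: 600M metadata ints
    val wrong: Long = capacity * 4  // Int multiply wraps: -1894967296
    val right: Long = capacity * 4L // widens before multiplying: 2400000000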
@@ -128,10 +131,10 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
     var pos = 0

     def writeNext(writer: BlockObjectWriter): Unit = {
-      val keyStart = metaBuffer.get(pos + KEY_START)
-      val valueEnd = metaBuffer.get(pos + VAL_END)
+      val keyStart = getKeyStartPos(metaBuffer, pos)
+      val keyValLen = metaBuffer.get(pos + KEY_VAL_LEN)
       pos += RECORD_SIZE
-      kvBuffer.read(keyStart, writer, valueEnd - keyStart)
+      kvBuffer.read(keyStart, writer, keyValLen)
       writer.recordWritten()
     }
     def nextPartition(): Int = metaBuffer.get(pos + PARTITION)
@@ -163,23 +166,26 @@ private[spark] class PartitionedSerializedPairBuffer[K, V](
 private[spark] class OrderedInputStream(metaBuffer: IntBuffer, kvBuffer: ChainedBuffer)
   extends InputStream {

+  import PartitionedSerializedPairBuffer._
+
   private var metaBufferPos = 0
   private var kvBufferPos =
-    if (metaBuffer.position > 0) metaBuffer.get(metaBufferPos + KEY_START) else 0
+    if (metaBuffer.position > 0) getKeyStartPos(metaBuffer, metaBufferPos) else 0

   override def read(bytes: Array[Byte]): Int = read(bytes, 0, bytes.length)

   override def read(bytes: Array[Byte], offs: Int, len: Int): Int = {
     if (metaBufferPos >= metaBuffer.position) {
       return -1
     }
-    val bytesRemainingInRecord = metaBuffer.get(metaBufferPos + VAL_END) - kvBufferPos
+    val bytesRemainingInRecord = (metaBuffer.get(metaBufferPos + KEY_VAL_LEN) -
+      (kvBufferPos - getKeyStartPos(metaBuffer, metaBufferPos))).toInt
     val toRead = math.min(bytesRemainingInRecord, len)
     kvBuffer.read(kvBufferPos, bytes, offs, toRead)
     if (toRead == bytesRemainingInRecord) {
       metaBufferPos += RECORD_SIZE
       if (metaBufferPos < metaBuffer.position) {
-        kvBufferPos = metaBuffer.get(metaBufferPos + KEY_START)
+        kvBufferPos = getKeyStartPos(metaBuffer, metaBufferPos)
       }
     } else {
       kvBufferPos += toRead
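The new expression derives the bytes left in the current record from its length and the bytes already consumed, instead of reading an absolute end offset. A worked example with hypothetical values:

    val keyStart = 100L     // from getKeyStartPos: record begins at byte 100 of kvBuffer
    val keyValLen = 40      // from the KEY_VAL_LEN slot: record is 40 bytes long
    val kvBufferPos = 115L  // 15 bytes of this record already consumed
    val remaining = (keyValLen - (kvBufferPos - keyStart)).toInt  // 40 - 15 = 25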
@@ -246,9 +252,14 @@ private[spark] class SerializedSortDataFormat extends SortDataFormat[Int, IntBuf
 }

 private[spark] object PartitionedSerializedPairBuffer {
-  val KEY_START = 0
-  val VAL_START = 1
-  val VAL_END = 2
+  val KEY_START = 0 // keyStart, a long, gets split across two ints
+  val KEY_VAL_LEN = 2
   val PARTITION = 3
-  val RECORD_SIZE = Seq(KEY_START, VAL_START, VAL_END, PARTITION).size // num ints of metadata
+  val RECORD_SIZE = PARTITION + 1 // num ints of metadata
+
+  def getKeyStartPos(metaBuffer: IntBuffer, metaBufferPos: Int): Long = {
+    val lower32 = metaBuffer.get(metaBufferPos + KEY_START)
+    val upper32 = metaBuffer.get(metaBufferPos + KEY_START + 1)
+    (upper32.toLong << 32) | (lower32 & 0xFFFFFFFFL)
+  }
 }
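A quick round-trip sketch for getKeyStartPos, assuming spark-package visibility; the buffer contents are illustrative. The & 0xFFFFFFFFL mask matters once bit 31 of the offset is set, because the lower half then reads back as a negative Int and would otherwise sign-extend into the upper bits:

    import java.nio.IntBuffer
    import org.apache.spark.util.collection.PartitionedSerializedPairBuffer._

    val meta = IntBuffer.allocate(RECORD_SIZE)
    val keyStart = (1L << 31) + 7     // bit 31 set: lower half is negative as an Int
    meta.put(keyStart.toInt)          // KEY_START slot: lower 32 bits
    meta.put((keyStart >> 32).toInt)  // second slot: upper 32 bits
    meta.put(42)                      // KEY_VAL_LEN
    meta.put(0)                       // PARTITION
    assert(getKeyStartPos(meta, 0) == keyStart)  // masking restores 2147483655L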