44 "log"
55 "path/filepath"
66 "sort"
7+ "strings"
78 "sync"
89 "unicode/utf8"
910
@@ -48,6 +49,10 @@ const (
4849 // RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
4950 // RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
5051 RenameAnalysisSetSizeLimit = 1000
52+
53+ // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
54+ // to be diff-ed on byte level.
55+ RenameAnalysisByteDiffSizeThreshold = 100000
5156)
5257
5358// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
@@ -367,12 +372,12 @@ func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
367372}
368373
369374func (ra * RenameAnalysis ) blobsAreClose (blob1 * CachedBlob , blob2 * CachedBlob ) (bool , error ) {
375+ cleanReturn := false
370376 defer func () {
371- if err := recover (); err != nil {
377+ if ! cleanReturn {
372378 log .Println ()
373379 log .Println (blob1 .Hash .String ())
374380 log .Println (blob2 .Hash .String ())
375- panic (err )
376381 }
377382 }()
378383 _ , err1 := blob1 .CountLines ()
@@ -382,6 +387,7 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
382387 bsdifflen := DiffBytes (blob1 .Data , blob2 .Data )
383388 delta := int ((int64 (bsdifflen ) * 100 ) / internal .Max64 (
384389 internal .Min64 (blob1 .Size , blob2 .Size ), 1 ))
390+ cleanReturn = true
385391 return 100 - delta >= ra .SimilarityThreshold , nil
386392 }
387393 src , dst := string (blob1 .Data ), string (blob2 .Data )
@@ -390,72 +396,104 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
390396 // compute the line-by-line diff, then the char-level diffs of the del-ins blocks
391397 // yes, this algorithm is greedy and not exact
392398 dmp := diffmatchpatch .New ()
393- srcLines , dstLines , lines := dmp .DiffLinesToRunes (src , dst )
394- diffs := dmp .DiffMainRunes (srcLines , dstLines , false )
399+ srcLineRunes , dstLineRunes , _ := dmp .DiffLinesToRunes (src , dst )
400+ // the third returned value, []string, is the mapping from runes to lines
401+ // we cannot use it because it is approximate and has string collisions
402+ // that is, the mapping is wrong for huge files
403+ diffs := dmp .DiffMainRunes (srcLineRunes , dstLineRunes , false )
404+
405+ srcPositions := calcLinePositions (src )
406+ dstPositions := calcLinePositions (dst )
395407 var common , posSrc , prevPosSrc , posDst int
396408 possibleDelInsBlock := false
397409 for _ , edit := range diffs {
398410 switch edit .Type {
399411 case diffmatchpatch .DiffDelete :
400412 possibleDelInsBlock = true
401413 prevPosSrc = posSrc
402- for _ , lineno := range edit .Text {
403- posSrc += len (lines [lineno ])
404- }
414+ posSrc += utf8 .RuneCountInString (edit .Text )
405415 case diffmatchpatch .DiffInsert :
406- nextPosDst := posDst
407- for _ , lineno := range edit .Text {
408- nextPosDst += len (lines [lineno ])
409- }
416+ nextPosDst := posDst + utf8 .RuneCountInString (edit .Text )
410417 if possibleDelInsBlock {
411418 possibleDelInsBlock = false
412- localDmp := diffmatchpatch .New ()
413- localSrc := src [prevPosSrc :posSrc ]
414- localDst := dst [posDst :nextPosDst ]
415- localDiffs := localDmp .DiffMainRunes ([]rune (localSrc ), []rune (localDst ), false )
416- for _ , localEdit := range localDiffs {
417- if localEdit .Type == diffmatchpatch .DiffEqual {
418- common += utf8 .RuneCountInString (localEdit .Text )
419+ if internal .Max (srcPositions [posSrc ]- srcPositions [prevPosSrc ],
420+ dstPositions [nextPosDst ]- dstPositions [posDst ]) < RenameAnalysisByteDiffSizeThreshold {
421+ localDmp := diffmatchpatch .New ()
422+ localSrc := src [srcPositions [prevPosSrc ]:srcPositions [posSrc ]]
423+ localDst := dst [dstPositions [posDst ]:dstPositions [nextPosDst ]]
424+ localDiffs := localDmp .DiffMainRunes (
425+ strToLiteralRunes (localSrc ), strToLiteralRunes (localDst ), false )
426+ for _ , localEdit := range localDiffs {
427+ if localEdit .Type == diffmatchpatch .DiffEqual {
428+ common += utf8 .RuneCountInString (localEdit .Text )
429+ }
419430 }
420431 }
421432 }
422433 posDst = nextPosDst
423434 case diffmatchpatch .DiffEqual :
424435 possibleDelInsBlock = false
425- for _ , lineno := range edit .Text {
426- common += utf8 . RuneCountInString ( lines [ lineno ])
427- step := len ( lines [ lineno ])
428- posSrc += step
429- posDst += step
436+ step := utf8 . RuneCountInString ( edit .Text )
437+ // for i := range edit.Text does *not* work here:
438+ // ranging over a string yields byte offsets, not rune indices, so `i` can exceed the number of runes
439+ for i := 0 ; i < step ; i ++ {
440+ common += srcPositions [ posSrc + i + 1 ] - srcPositions [ posSrc + i ]
430441 }
442+ posSrc += step
443+ posDst += step
431444 }
432445 if possibleDelInsBlock {
433446 continue
434447 }
435448 // supposing that the rest of the lines are the same (they are not - too optimistic),
436449 // estimate the maximum similarity and exit the loop if it is lower than our threshold
437450 var srcPendingSize , dstPendingSize int
438- if posSrc < len (src ) {
439- srcPendingSize = utf8 .RuneCountInString (src [posSrc :])
440- }
441- if posDst < len (dst ) {
442- dstPendingSize = utf8 .RuneCountInString (dst [posDst :])
443- }
451+ srcPendingSize = len (src ) - srcPositions [posSrc ]
452+ dstPendingSize = len (dst ) - dstPositions [posDst ]
444453 maxCommon := common + internal .Min (srcPendingSize , dstPendingSize )
445454 similarity := (maxCommon * 100 ) / maxSize
446455 if similarity < ra .SimilarityThreshold {
456+ cleanReturn = true
447457 return false , nil
448458 }
449459 similarity = (common * 100 ) / maxSize
450460 if similarity >= ra .SimilarityThreshold {
461+ cleanReturn = true
451462 return true , nil
452463 }
453464 }
454465 // the very last "overly optimistic" estimate was actually precise, so since we are still here
455466 // the blobs are similar
467+ cleanReturn = true
456468 return true , nil
457469}
458470
// calcLinePositions returns the byte offset at which every "\n"-delimited
// segment of text starts, followed by one final entry equal to len(text).
//
// For non-empty text the result has strings.Count(text, "\n")+2 elements, so
// positions[k+1]-positions[k] is the byte length of segment k including its
// line break. A text that ends with "\n" therefore gets two trailing entries
// equal to len(text). Empty text yields []int{0}.
func calcLinePositions(text string) []int {
	if text == "" {
		return []int{0}
	}
	// One start offset per segment plus the closing sentinel.
	// positions[0] is 0: the first segment starts at the beginning of text.
	positions := make([]int, 1, strings.Count(text, "\n")+2)
	for i := 0; i < len(text); i++ {
		if text[i] == '\n' {
			// The next segment starts right after the line break.
			positions = append(positions, i+1)
		}
	}
	// Sentinel: the end of the last segment (there is no line break after it).
	return append(positions, len(text))
}
488+
// strToLiteralRunes widens every byte of s into its own rune, without any
// UTF-8 decoding: the result has exactly len(s) elements and element i holds
// the value of byte s[i].
func strToLiteralRunes(s string) []rune {
	widened := make([]rune, 0, len(s))
	for pos := 0; pos < len(s); pos++ {
		// indexing a string yields the raw byte, not a decoded code point
		widened = append(widened, rune(s[pos]))
	}
	return widened
}
496+
459497type sortableChange struct {
460498 change * object.Change
461499 hash plumbing.Hash
0 commit comments