Merge different languages for language stats (#24900)
Fix #24896. If users set language names that differ only by letter case via `linguist-language`, the `stats` map could contain entries such as `java: 100, Java: 200`. Language stats are stored case-insensitively in the database under a unique key, so names that differ only by case must be merged into one unique name, e.g. `Java: 300`.
This commit is contained in:
		
							parent
							
								
									63d5e762d8
								
							
						
					
					
						commit
						395bb33e4c
					
				|  | @ -3,7 +3,46 @@ | |||
| 
 | ||||
| package git | ||||
| 
 | ||||
| import ( | ||||
| 	"strings" | ||||
| 	"unicode" | ||||
| ) | ||||
| 
 | ||||
// Size thresholds, expressed in bytes.
// NOTE(review): the code that consumes these limits is outside this view;
// confirm their exact roles against the callers.
const (
	fileSizeLimit int64 = 16 << 10 // 16 KiB
	bigFileSize   int64 = 1 << 20  // 1 MiB
)
| 
 | ||||
// mergeLanguageStats merges language names that differ only by letter case
// into a single entry whose value is the sum of the merged entries.
//
// For each group of names that are equal after lower-casing, the spelling with
// the most upper-case letters is kept as the unique name. When several
// spellings share the highest upper-case count, the lexicographically smallest
// one is chosen so that the result does not depend on map iteration order.
//
// The input map is not modified; a new, non-nil map is always returned.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	type candidate struct {
		uniqueName string
		upperCount int
	}

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	// First pass: pick the canonical spelling for every lower-cased name.
	names := make(map[string]candidate, len(stats))
	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		best, seen := names[lower]
		// The explicit tie-break on the name keeps the choice stable even
		// though Go randomizes map iteration order.
		if !seen || cnt > best.upperCount || (cnt == best.upperCount && name < best.uniqueName) {
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	// Second pass: accumulate every entry under its canonical spelling.
	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}
|  |  | |||
|  | @ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
| 		sizes[firstExcludedLanguage] = firstExcludedLanguageSize | ||||
| 	} | ||||
| 
 | ||||
| 	return sizes, nil | ||||
| 	return mergeLanguageStats(sizes), nil | ||||
| } | ||||
| 
 | ||||
| func readFile(f *object.File, limit int64) ([]byte, error) { | ||||
|  |  | |||
|  | @ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
| 		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
 | ||||
| 		// - eg. do the all the detection tests using filename first before reading content.
 | ||||
| 		language := analyze.GetCodeLanguage(f.Name(), content) | ||||
| 		if language == enry.OtherLanguage || language == "" { | ||||
| 		if language == "" { | ||||
| 			continue | ||||
| 		} | ||||
| 
 | ||||
|  | @ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
| 
 | ||||
| 		included, checked := includedLanguage[language] | ||||
| 		if !checked { | ||||
| 			langtype := enry.GetLanguageType(language) | ||||
| 			included = langtype == enry.Programming || langtype == enry.Markup | ||||
| 			langType := enry.GetLanguageType(language) | ||||
| 			included = langType == enry.Programming || langType == enry.Markup | ||||
| 			includedLanguage[language] = included | ||||
| 		} | ||||
| 		if included { | ||||
|  | @ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err | |||
| 		sizes[firstExcludedLanguage] = firstExcludedLanguageSize | ||||
| 	} | ||||
| 
 | ||||
| 	return sizes, nil | ||||
| 	return mergeLanguageStats(sizes), nil | ||||
| } | ||||
| 
 | ||||
| func discardFull(rd *bufio.Reader, discard int64) error { | ||||
|  |  | |||
|  | @ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) { | |||
| 		"Java":   112, | ||||
| 	}, stats) | ||||
| } | ||||
| 
 | ||||
| func TestMergeLanguageStats(t *testing.T) { | ||||
| 	assert.EqualValues(t, map[string]int64{ | ||||
| 		"PHP":    1, | ||||
| 		"python": 10, | ||||
| 		"JAVA":   700, | ||||
| 	}, mergeLanguageStats(map[string]int64{ | ||||
| 		"PHP":    1, | ||||
| 		"python": 10, | ||||
| 		"Java":   100, | ||||
| 		"java":   200, | ||||
| 		"JAVA":   400, | ||||
| 	})) | ||||
| } | ||||
|  |  | |||
|  | @ -10,7 +10,7 @@ import ( | |||
| 
 | ||||
// FallbackErrorf is the last chance to show an error if the logger has internal errors.
// It formats the message according to format and args, appends a newline and
// writes the result to os.Stderr.
func FallbackErrorf(format string, args ...any) {
	// args must be expanded with "..."; passing the slice itself would hand
	// Fprintf a single []any operand instead of the individual arguments.
	// The write error is deliberately ignored: there is nowhere left to report it.
	_, _ = fmt.Fprintf(os.Stderr, format+"\n", args...)
}
| 
 | ||||
| func GetLevel() Level { | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue