121 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			121 lines
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Go
		
	
	
	
| // Copyright 2015 Huan Du. All rights reserved.
 | |
| // Licensed under the MIT license that can be found in the LICENSE file.
 | |
| 
 | |
| package xstrings
 | |
| 
 | |
| import (
 | |
| 	"unicode"
 | |
| 	"unicode/utf8"
 | |
| )
 | |
| 
 | |
// Len reports the number of runes in str rather than its byte length.
// A byte that does not belong to a valid UTF-8 sequence counts as one rune.
func Len(str string) int {
	count := 0
	// Ranging over a string advances one rune per iteration.
	for range str {
		count++
	}
	return count
}
 | |
| 
 | |
| // WordCount returns number of words in a string.
 | |
| //
 | |
| // Word is defined as a locale dependent string containing alphabetic characters,
 | |
| // which may also contain but not start with `'` and `-` characters.
 | |
| func WordCount(str string) int {
 | |
| 	var r rune
 | |
| 	var size, n int
 | |
| 
 | |
| 	inWord := false
 | |
| 
 | |
| 	for len(str) > 0 {
 | |
| 		r, size = utf8.DecodeRuneInString(str)
 | |
| 
 | |
| 		switch {
 | |
| 		case isAlphabet(r):
 | |
| 			if !inWord {
 | |
| 				inWord = true
 | |
| 				n++
 | |
| 			}
 | |
| 
 | |
| 		case inWord && (r == '\'' || r == '-'):
 | |
| 			// Still in word.
 | |
| 
 | |
| 		default:
 | |
| 			inWord = false
 | |
| 		}
 | |
| 
 | |
| 		str = str[size:]
 | |
| 	}
 | |
| 
 | |
| 	return n
 | |
| }
 | |
| 
 | |
// minCJKCharacter is the lowest code point isAlphabet checks against the Han
// script table; letters below it are accepted without that lookup.
const minCJKCharacter = '\u3400'

// isAlphabet reports whether r is a letter that is not a CJK (Han) ideograph.
//
// Non-letters always yield false. Letters below minCJKCharacter always yield
// true. Any other letter yields true only when it is outside the Han script,
// so Hangul syllables, for example, still count as alphabetic.
//
// The Han check uses the standard library's unicode.Han table instead of
// hand-written ranges, so ideographs such as U+4DB5, U+9FD0 or the
// compatibility block at U+F900 are rejected as well.
func isAlphabet(r rune) bool {
	if !unicode.IsLetter(r) {
		return false
	}

	// Quick check for non-CJK character.
	if r < minCJKCharacter {
		return true
	}

	return !unicode.Is(unicode.Han, r)
}
 | |
| 
 | |
| // Width returns string width in monotype font.
 | |
| // Multi-byte characters are usually twice the width of single byte characters.
 | |
| //
 | |
| // Algorithm comes from `mb_strwidth` in PHP.
 | |
| // http://php.net/manual/en/function.mb-strwidth.php
 | |
| func Width(str string) int {
 | |
| 	var r rune
 | |
| 	var size, n int
 | |
| 
 | |
| 	for len(str) > 0 {
 | |
| 		r, size = utf8.DecodeRuneInString(str)
 | |
| 		n += RuneWidth(r)
 | |
| 		str = str[size:]
 | |
| 	}
 | |
| 
 | |
| 	return n
 | |
| }
 | |
| 
 | |
// RuneWidth returns the display width of a single rune in a monotype font.
//
// The result is decided purely by code point:
//
//	below U+0020, or utf8.RuneError   0
//	U+0020 up to U+1FFF               1
//	U+2000 up to U+FF60               2
//	U+FF61 up to U+FF9F               1
//	U+FFA0 and above                  2
//
// The table follows PHP's `mb_strwidth`.
// http://php.net/manual/en/function.mb-strwidth.php
func RuneWidth(r rune) int {
	// Control characters and the replacement rune occupy no columns.
	if r < '\x20' || r == utf8.RuneError {
		return 0
	}

	// Each guard below relies on the previous ones having excluded smaller values.
	if r < '\u2000' {
		return 1
	}
	if r < '\uFF61' {
		return 2
	}
	if r < '\uFFA0' {
		return 1
	}
	return 2
}
 |