Golang实现词频统计

前端之家收集整理的这篇文章主要介绍了Golang实现词频统计。前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。

本例使用golang实现词频统计。步骤:

(1)从文件中读取一篇文章

(2)统计词频,按单词出现的频率从大到小进行排序。

(3)写入到文件中。

注:任何非英文字母的符号均认为是单词分隔符(即等同于空格)。

效率:使用本程序统计一篇150W单词的文章,大约需要70ms.

1.核心代码

  1. package wordtest
  2.  
  import (
  	"bytes"
  	"fmt"
  	"io/ioutil"
  	"os"
  	"runtime"
  	"sort"
  	"strings"
  	"time"
  )
  13.  
  14. //简单的词频统计任务
  15. func CountTestBase(inputFilePath string,outputFilePath string) {
  16. //时间开始点
  17. start := time.Now().UnixNano() / 1e6
  18.  
  19. //读取文件
  20. fileData,err := IoUtil.ReadFile(inputFilePath)
  21. CheckError(err,"read file")
  22. var fileText string = string(fileData)
  23.  
  24. //根据cpu核数新开协程
  25. newRountineCount := runtime.Numcpu()*2 - 1
  26. runtime.GOMAXPROCS(newRountineCount + 1)
  27. //切分文件
  28. parts := splitFileText(fileText,newRountineCount)
  29.  
  30. var ch chan map[string]int = make(chan map[string]int,newRountineCount)
  31. for i := 0; i < newRountineCount; i++ {
  32. go countTest(parts[i],ch)
  33. }
  34.  
  35. //主线程接收数据
  36. var totalWordsMap map[string]int = make(map[string]int,0)
  37. completeCount := 0
  38. for {
  39. receiveData := <-ch
  40. for k,v := range receiveData {
  41. totalWordsMap[strings.ToLower(k)] += v
  42. }
  43. completeCount++
  44.  
  45. if newRountineCount == completeCount {
  46. break
  47. }
  48. }
  49.  
  50. //添加进slice,并排序
  51. list := make(WordCountBeanList,0)
  52. for k,v := range totalWordsMap {
  53. list = append(list,NewWordCountBean(k,v))
  54. }
  55. sort.Sort(list)
  56. //时间结束点
  57. end := time.Now().UnixNano() / 1e6
  58. fmt.Printf("time consume:%dms\n",end-start)
  59.  
  60. //输出
  61. wordsCount := list.totalCount()
  62. var data bytes.Buffer
  63. data.WriteString(fmt.Sprintf("程序执行:%dms\n",end-start))
  64. data.WriteString(fmt.Sprintf("文章总单词数:%d\n\n",wordsCount))
  65. for _,v := range list {
  66. var percent float64 = 100.0 * float64(v.count) / float64(wordsCount)
  67. _,err := data.WriteString(fmt.Sprintf("%s: %d,%3.2f%%\n",v.word,v.count,percent))
  68. CheckError(err,"bytes.Buffer,WriteString")
  69. }
  70.  
  71. err = IoUtil.WriteFile(outputFilePath,[]byte(data.String()),os.ModePerm)
  72. CheckError(err,"IoUtil.WriteFile")
  73. }
  74.  
  // countTest counts the words in text and sends the resulting map on ch.
  //
  // A word is a maximal run of ASCII letters (A-Z, a-z); every other
  // character is treated as a separator. Keys keep the case they have in
  // text. Exactly one map is sent, even when text contains no words.
  func countTest(text string, ch chan map[string]int) {
  	wordMap := make(map[string]int)

  	// wordStart is the byte index where the current word begins, or -1 while
  	// the scan is between words.
  	wordStart := -1
  	for i, r := range text {
  		isLetter := (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z')
  		switch {
  		case isLetter && wordStart < 0:
  			wordStart = i
  		case !isLetter && wordStart >= 0:
  			wordMap[text[wordStart:i]]++
  			wordStart = -1
  		}
  	}

  	// Flush a word that runs up to the end of the text.
  	if wordStart >= 0 {
  		wordMap[text[wordStart:]]++
  	}
  	ch <- wordMap
  }
  101.  
  // splitFileText cuts fileText into n consecutive parts whose concatenation
  // is fileText itself.
  //
  // Each of the first n-1 cuts starts at a multiple of len(fileText)/n and is
  // moved forward to the next space so that no word is split in two. When no
  // space follows a cut target, that part extends to the end of the text and
  // the remaining parts are empty. For n <= 0 the result is nil.
  func splitFileText(fileText string, n int) []string {
  	if n <= 0 {
  		return nil
  	}

  	length := len(fileText)
  	parts := make([]string, n)

  	lastPosition := 0
  	for i := 0; i < n-1; i++ {
  		position := length / n * (i + 1)
  		// Stop at the end of the text: without the bound a text with no space
  		// after the cut target would index past the last byte.
  		for position < length && fileText[position] != ' ' {
  			position++
  		}

  		parts[i] = fileText[lastPosition:position]
  		lastPosition = position
  	}

  	// The last part takes whatever remains.
  	parts[n-1] = fileText[lastPosition:]
  	return parts
  }
  122.  
  // CheckError panics when err is non-nil and does nothing otherwise.
  //
  // The panic value is the string msg + "," + err.Error(), so msg should name
  // the operation that failed.
  func CheckError(err error, msg string) {
  	if err == nil {
  		return
  	}
  	panic(msg + "," + err.Error())
  }
2.一个struct
  1. package wordtest
  2.  
  // WordCountBean pairs a word with the number of times it was counted.
  type WordCountBean struct {
  	word  string
  	count int
  }

  // NewWordCountBean returns a pointer to a WordCountBean holding word and
  // count.
  func NewWordCountBean(word string, count int) *WordCountBean {
  	return &WordCountBean{word: word, count: count}
  }

  // WordCountBeanList is a list of word counts. Its Len, Less and Swap methods
  // let it be sorted with sort.Sort: highest count first, ties ordered by word
  // ascending.
  type WordCountBeanList []*WordCountBean

  // Len returns the number of entries in the list.
  func (list WordCountBeanList) Len() int {
  	return len(list)
  }

  // Less reports whether entry i sorts before entry j: a larger count comes
  // first, and equal counts fall back to ascending word order.
  func (list WordCountBeanList) Less(i, j int) bool {
  	if list[i].count != list[j].count {
  		return list[i].count > list[j].count
  	}
  	return list[i].word < list[j].word
  }

  // Swap exchanges entries i and j.
  func (list WordCountBeanList) Swap(i, j int) {
  	list[i], list[j] = list[j], list[i]
  }

  // totalCount returns the sum of the counts of all entries; it is 0 for an
  // empty list.
  func (list WordCountBeanList) totalCount() int {
  	total := 0
  	for _, bean := range list {
  		total += bean.count
  	}
  	return total
  }
3.主函数
  1. package main
  2.  
  3. import (
  4. "WordsTest/wordtest"
  5. )
  6.  
  7. func main() {
  8. inputFilePath := "files/article.txt"
  9. outputFilePath := "files/result.txt"
  10.  
  11. wordtest.CountTestBase(inputFilePath,outputFilePath)
  12. }

猜你在找的Go相关文章