Golang Lesen Sie eine große Datei
1package main
2
3import (
4 "bufio"
5 "fmt"
6 "log"
7 "os"
8 "strconv"
9 "strings"
10 "sync"
11 "sync/atomic"
12 "time"
13)
14
// main streams the pipe-delimited file named by the first command-line
// argument and prints four reports to standard output:
//
//   - the full name found on lines 0, 432 and 43243 (zero-based file order),
//   - the total number of lines,
//   - the number of lines per date key (first six characters of field 5),
//   - the most common first name and how often it occurs.
//
// Lines are read in chunks of linesChunkLen and each chunk is parsed in its
// own goroutine; the shared maps are only touched while holding mutex.
// Malformed lines are still counted as lines but never cause a panic.
func main() {
	start := time.Now()
	if len(os.Args) < 2 {
		log.Fatal("usage: program <file>")
	}
	file, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	const linesChunkLen = 64 * 1024

	// Zero-based line indexes whose full name is reported.
	reportIndexes := []int{0, 432, 43243}

	nameMap := make(map[string]int)
	dateMap := make(map[int]int)
	namesAtIndex := make(map[int]string, len(reportIndexes))
	fileLineCount := int64(0)

	// The pools hold pointers to slices: putting a bare slice into a
	// sync.Pool boxes the slice header and allocates on every Put.
	linesPool := sync.Pool{New: func() interface{} {
		lines := make([]string, 0, linesChunkLen)
		return &lines
	}}
	entriesPool := sync.Pool{New: func() interface{} {
		entries := make([]entry, 0, linesChunkLen)
		return &entries
	}}
	var mutex sync.Mutex
	var wg sync.WaitGroup

	// process parses one chunk and merges it into the shared results.
	// base is the file index of the chunk's first line, which keeps the
	// "name at index" report independent of goroutine scheduling.
	process := func(chunkPtr *[]string, base int) {
		defer wg.Done()
		chunk := *chunkPtr
		atomic.AddInt64(&fileLineCount, int64(len(chunk)))

		entriesPtr := entriesPool.Get().(*[]entry)
		entries := (*entriesPtr)[:0]
		for _, text := range chunk {
			entries = append(entries, parseEntry(text))
		}
		*chunkPtr = chunk[:0]
		linesPool.Put(chunkPtr)

		mutex.Lock()
		for _, e := range entries {
			if e.firstName != "" {
				nameMap[e.firstName]++
			}
			dateMap[e.date]++
		}
		for _, want := range reportIndexes {
			if i := want - base; i >= 0 && i < len(entries) {
				namesAtIndex[want] = entries[i].name
			}
		}
		mutex.Unlock()

		*entriesPtr = entries[:0]
		entriesPool.Put(entriesPtr)
	}

	scanner := bufio.NewScanner(file)
	linesPtr := linesPool.Get().(*[]string)
	lines := (*linesPtr)[:0]
	base := 0
	// Only append after a successful Scan so an empty file produces no
	// phantom line.
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
		if len(lines) == linesChunkLen {
			*linesPtr = lines
			wg.Add(1)
			go process(linesPtr, base)
			base += len(lines)
			linesPtr = linesPool.Get().(*[]string)
			lines = (*linesPtr)[:0]
		}
	}
	// Flush the final, partially filled chunk.
	if len(lines) > 0 {
		*linesPtr = lines
		wg.Add(1)
		go process(linesPtr, base)
	}
	wg.Wait()
	if err := scanner.Err(); err != nil {
		log.Fatalf("reading %s: %v", os.Args[1], err)
	}

	// report c2: names at index
	for _, idx := range reportIndexes {
		if name, ok := namesAtIndex[idx]; ok {
			fmt.Printf("Name: %s at index: %v\n", name, idx)
		}
	}
	fmt.Printf("Name time: %v\n", time.Since(start))

	// report c1: total number of lines
	fmt.Printf("Total file line count: %v\n", atomic.LoadInt64(&fileLineCount))
	fmt.Printf("Line count time: %v\n", time.Since(start))

	// report c3: donation frequency
	for k, v := range dateMap {
		fmt.Printf("Donations per month and year: %v and donation ncount: %v\n", k, v)
	}
	fmt.Printf("Donations time: %v\n", time.Since(start))

	// report c4: most common firstName
	// Computed after all chunks are merged; ties are broken by the
	// lexicographically smallest name so the result is deterministic.
	commonName, commonCount := "", 0
	for name, count := range nameMap {
		if count > commonCount || (count == commonCount && name < commonName) {
			commonName, commonCount = name, count
		}
	}
	fmt.Printf("The most common first name is: %s and it occurs: %v times.\n", commonName, commonCount)
	fmt.Printf("Most common name time: %v\n", time.Since(start))
}

// entry holds the fields extracted from a single input line.
type entry struct {
	firstName string // first given name; empty when it cannot be determined
	name      string // trimmed content of the eighth field
	date      int    // first six characters of the fifth field as an integer; 0 if unavailable
}

// parseEntry extracts the name and date fields from one pipe-delimited line.
// Missing or short fields leave the corresponding entry values at their zero
// value instead of panicking.
func parseEntry(text string) entry {
	var e entry
	fields := strings.SplitN(text, "|", 9)
	if len(fields) > 7 {
		e.name = strings.TrimSpace(fields[7])
		e.firstName = firstNameOf(e.name)
	}
	if len(fields) > 4 && len(fields[4]) >= 6 {
		// A non-numeric prefix keeps date at 0, so such lines are tallied
		// under key 0 together with lines that have no usable date field.
		if d, err := strconv.Atoi(fields[4][:6]); err == nil {
			e.date = d
		}
	}
	return e
}

// firstNameOf returns the first word following the ", " separator in name,
// cut at any later comma. It returns "" when name has no ", " separator.
func firstNameOf(name string) string {
	sep := strings.Index(name, ", ")
	if sep < 0 {
		return ""
	}
	first := name[sep+2:]
	if end := strings.Index(first, " "); end >= 0 {
		first = first[:end]
	}
	if cs := strings.Index(first, ","); cs > 0 {
		first = first[:cs]
	}
	return first
}
Tired Tamarin