aboutsummaryrefslogtreecommitdiff
path: root/src/main
diff options
context:
space:
mode:
authorAlexander Yastrebov <yastrebov.alex@gmail.com>2024-01-23 20:49:28 +0100
committerGitHub <noreply@github.com>2024-01-23 20:49:28 +0100
commit4daeb94b048e074c2b80aac1074b68eb92285ea8 (patch)
treedcb34aeb870a49c0c45b04919b7c4bedfc4f9dc7 /src/main
parent464ba6209bb229f45e43693801ec5814d8815760 (diff)
Go implementation by AlexanderYastrebov (#298)
* Go implementation by AlexanderYastrebov This is a proof-of-concept to demonstrate non-java submission. It requires Docker with BuildKit plugin to build and export binary. Updates * #67 * #253 * Use collision-free id lookup * Use number of buckets greater than max number of keys
Diffstat (limited to 'src/main')
-rw-r--r--src/main/go/AlexanderYastrebov/Dockerfile22
-rw-r--r--src/main/go/AlexanderYastrebov/README.md58
-rw-r--r--src/main/go/AlexanderYastrebov/calc.go283
-rw-r--r--src/main/go/AlexanderYastrebov/calc_test.go86
-rw-r--r--src/main/go/AlexanderYastrebov/go.mod3
5 files changed, 452 insertions, 0 deletions
diff --git a/src/main/go/AlexanderYastrebov/Dockerfile b/src/main/go/AlexanderYastrebov/Dockerfile
new file mode 100644
index 0000000..a3b2806
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/Dockerfile
@@ -0,0 +1,22 @@
+#
+# Copyright 2023 The original authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+FROM golang AS build-stage
+COPY . src/
+RUN cd src && go build .
+
+FROM scratch AS export-stage
+COPY --from=build-stage /go/src/1brc /
diff --git a/src/main/go/AlexanderYastrebov/README.md b/src/main/go/AlexanderYastrebov/README.md
new file mode 100644
index 0000000..dc01192
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/README.md
@@ -0,0 +1,58 @@
+# 1brc in go
+
+It uses Docker with BuildKit plugin to build and [export binary](https://docs.docker.com/engine/reference/commandline/build/#output) binary,
+see [prepare_AlexanderYastrebov.sh](../../../../prepare_AlexanderYastrebov.sh)
+and [calculate_average_AlexanderYastrebov.sh](../../../../calculate_average_AlexanderYastrebov.sh).
+
+Demo:
+```sh
+$ ./test.sh AlexanderYastrebov
+[+] Building 0.2s (9/9) FINISHED
+ => [internal] load .dockerignore 0.0s
+ => => transferring context: 2B 0.0s
+ => [internal] load build definition from Dockerfile 0.0s
+ => => transferring dockerfile: 172B 0.0s
+ => [internal] load metadata for docker.io/library/golang:latest 0.0s
+ => [internal] load build context 0.0s
+ => => transferring context: 145B 0.0s
+ => [build-stage 1/3] FROM docker.io/library/golang 0.0s
+ => CACHED [build-stage 2/3] COPY . src/ 0.0s
+ => CACHED [build-stage 3/3] RUN cd src && go build . 0.0s
+ => CACHED [export-stage 1/1] COPY --from=build-stage /go/src/1brc / 0.0s
+ => exporting to client directory 0.1s
+ => => copying files 2.03MB 0.0s
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10000-unique-keys.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-1.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-20.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-2.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-3.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-boundaries.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-complex-utf8.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-dot.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-shortest.txt
+Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-short.txt
+
+# Run once to setup the benchmark
+# ./create_measurements.sh 1000000000
+# mv measurements.txt measurements_1B.txt
+# ln -s measurements_1B.txt measurements.txt
+# ./calculate_average_baseline.sh > out_expected.txt
+
+$ wc -l measurements_1B.txt
+1000000000 measurements_1B.txt
+
+$ ./evaluate2.sh AlexanderYastrebov royvanrijn
+... 0.0s
+Benchmark 1: ./calculate_average_AlexanderYastrebov.sh 2>&1
+ Time (mean ± σ): 16.786 s ± 0.545 s [User: 56.030 s, System: 10.068 s]
+ Range (min … max): 15.918 s … 17.309 s 5 runs
+...
+Benchmark 1: ./calculate_average_royvanrijn.sh 2>&1
+ Time (mean ± σ): 16.731 s ± 0.190 s [User: 56.485 s, System: 10.279 s]
+ Range (min … max): 16.490 s … 16.951 s 5 runs
+
+Summary
+ AlexanderYastrebov: trimmed mean 16.901712789513336, raw times 16.69836470718,17.30911065018,16.83413600418,15.91787706218,17.17263765718
+ royvanrijn: trimmed mean 16.738037123633333, raw times 16.4900939703,16.9513459953,16.5794539913,16.8297746273,16.8048827523
+```
diff --git a/src/main/go/AlexanderYastrebov/calc.go b/src/main/go/AlexanderYastrebov/calc.go
new file mode 100644
index 0000000..149d38d
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/calc.go
@@ -0,0 +1,283 @@
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "log"
+ "math"
+ "os"
+ "runtime"
+ "sort"
+ "sync"
+ "syscall"
+)
+
+type measurement struct {
+ min, max, sum, count int64
+}
+
+func main() {
+ if len(os.Args) != 2 {
+ log.Fatalf("Missing measurements filename")
+ }
+
+ measurements := processFile(os.Args[1])
+
+ ids := make([]string, 0, len(measurements))
+ for id := range measurements {
+ ids = append(ids, id)
+ }
+ sort.Strings(ids)
+
+ fmt.Print("{")
+ for i, id := range ids {
+ if i > 0 {
+ fmt.Print(", ")
+ }
+ m := measurements[id]
+ fmt.Printf("%s=%.1f/%.1f/%.1f", id, round(float64(m.min)/10.0), round(float64(m.sum)/10.0/float64(m.count)), round(float64(m.max)/10.0))
+ }
+ fmt.Println("}")
+}
+
+func processFile(filename string) map[string]*measurement {
+ f, err := os.Open(filename)
+ if err != nil {
+ log.Fatalf("Open: %v", err)
+ }
+ defer f.Close()
+
+ fi, err := f.Stat()
+ if err != nil {
+ log.Fatalf("Stat: %v", err)
+ }
+
+ size := fi.Size()
+ if size <= 0 || size != int64(int(size)) {
+ log.Fatalf("Invalid file size: %d", size)
+ }
+
+ data, err := syscall.Mmap(int(f.Fd()), 0, int(size), syscall.PROT_READ, syscall.MAP_SHARED)
+ if err != nil {
+ log.Fatalf("Mmap: %v", err)
+ }
+
+ defer func() {
+ if err := syscall.Munmap(data); err != nil {
+ log.Fatalf("Munmap: %v", err)
+ }
+ }()
+
+ return process(data)
+}
+
+func process(data []byte) map[string]*measurement {
+ nChunks := runtime.NumCPU()
+
+ chunkSize := len(data) / nChunks
+ if chunkSize == 0 {
+ chunkSize = len(data)
+ }
+
+ chunks := make([]int, 0, nChunks)
+ offset := 0
+ for offset < len(data) {
+ offset += chunkSize
+ if offset >= len(data) {
+ chunks = append(chunks, len(data))
+ break
+ }
+
+ nlPos := bytes.IndexByte(data[offset:], '\n')
+ if nlPos == -1 {
+ chunks = append(chunks, len(data))
+ break
+ } else {
+ offset += nlPos + 1
+ chunks = append(chunks, offset)
+ }
+ }
+
+ var wg sync.WaitGroup
+ wg.Add(len(chunks))
+
+ results := make([]map[string]*measurement, len(chunks))
+ start := 0
+ for i, chunk := range chunks {
+ go func(data []byte, i int) {
+ results[i] = processChunk(data)
+ wg.Done()
+ }(data[start:chunk], i)
+ start = chunk
+ }
+ wg.Wait()
+
+ measurements := make(map[string]*measurement)
+ for _, r := range results {
+ for id, rm := range r {
+ m := measurements[id]
+ if m == nil {
+ measurements[id] = rm
+ } else {
+ m.min = min(m.min, rm.min)
+ m.max = max(m.max, rm.max)
+ m.sum += rm.sum
+ m.count += rm.count
+ }
+ }
+ }
+ return measurements
+}
+
+func processChunk(data []byte) map[string]*measurement {
+ // Use fixed size linear probe lookup table
+ const (
+ // use power of 2 for fast modulo calculation,
+ // should be larger than max number of keys which is 10_000
+ entriesSize = 1 << 14
+
+ // use FNV-1a hash
+ fnv1aOffset64 = 14695981039346656037
+ fnv1aPrime64 = 1099511628211
+ )
+
+ type entry struct {
+ m measurement
+ hash uint64
+ vlen int
+ value [128]byte // use power of 2 > 100 for alignment
+ }
+ entries := make([]entry, entriesSize)
+ entriesCount := 0
+
+ // keep short and inlinable
+ getMeasurement := func(hash uint64, value []byte) *measurement {
+ i := hash & uint64(entriesSize-1)
+ entry := &entries[i]
+
+ // bytes.Equal could be commented to speedup assuming no hash collisions
+ for entry.vlen > 0 && !(entry.hash == hash && bytes.Equal(entry.value[:entry.vlen], value)) {
+ i = (i + 1) & uint64(entriesSize-1)
+ entry = &entries[i]
+ }
+
+ if entry.vlen == 0 {
+ entry.hash = hash
+ entry.vlen = copy(entry.value[:], value)
+ entriesCount++
+ }
+ return &entry.m
+ }
+
+ // assume valid input
+ for len(data) > 0 {
+
+ idHash := uint64(fnv1aOffset64)
+ semiPos := 0
+ for i, b := range data {
+ if b == ';' {
+ semiPos = i
+ break
+ }
+
+ // calculate FNV-1a hash
+ idHash ^= uint64(b)
+ idHash *= fnv1aPrime64
+ }
+
+ idData := data[:semiPos]
+
+ data = data[semiPos+1:]
+
+ var temp int64
+ // parseNumber
+ {
+ negative := data[0] == '-'
+ if negative {
+ data = data[1:]
+ }
+
+ _ = data[3]
+ if data[1] == '.' {
+ // 1.2\n
+ temp = int64(data[0])*10 + int64(data[2]) - '0'*(10+1)
+ data = data[4:]
+ // 12.3\n
+ } else {
+ _ = data[4]
+ temp = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1)
+ data = data[5:]
+ }
+
+ if negative {
+ temp = -temp
+ }
+ }
+
+ m := getMeasurement(idHash, idData)
+ if m.count == 0 {
+ m.min = temp
+ m.max = temp
+ m.sum = temp
+ m.count = 1
+ } else {
+ m.min = min(m.min, temp)
+ m.max = max(m.max, temp)
+ m.sum += temp
+ m.count++
+ }
+ }
+
+ result := make(map[string]*measurement, entriesCount)
+ for i := range entries {
+ entry := &entries[i]
+ if entry.m.count > 0 {
+ result[string(entry.value[:entry.vlen])] = &entry.m
+ }
+ }
+ return result
+}
+
+func round(x float64) float64 {
+ return roundJava(x*10.0) / 10.0
+}
+
+// roundJava returns the closest integer to the argument, with ties
+// rounding to positive infinity, see java's Math.round
+func roundJava(x float64) float64 {
+ t := math.Trunc(x)
+ if x < 0.0 && t-x == 0.5 {
+ //return t
+ } else if math.Abs(x-t) >= 0.5 {
+ t += math.Copysign(1, x)
+ }
+
+ if t == 0 { // check -0
+ return 0.0
+ }
+ return t
+}
+
+// parseNumber reads decimal number that matches "^-?[0-9]{1,2}[.][0-9]" pattern,
+// e.g.: -12.3, -3.4, 5.6, 78.9 and return the value*10, i.e. -123, -34, 56, 789.
+func parseNumber(data []byte) int64 {
+ negative := data[0] == '-'
+ if negative {
+ data = data[1:]
+ }
+
+ var result int64
+ switch len(data) {
+ // 1.2
+ case 3:
+ result = int64(data[0])*10 + int64(data[2]) - '0'*(10+1)
+ // 12.3
+ case 4:
+ result = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1)
+ }
+
+ if negative {
+ return -result
+ }
+ return result
+}
diff --git a/src/main/go/AlexanderYastrebov/calc_test.go b/src/main/go/AlexanderYastrebov/calc_test.go
new file mode 100644
index 0000000..db7e27a
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/calc_test.go
@@ -0,0 +1,86 @@
+package main
+
+import (
+ "fmt"
+ "os"
+ "testing"
+)
+
+func TestRoundJava(t *testing.T) {
+ for _, tc := range []struct {
+ value float64
+ expected string
+ }{
+ {value: -1.5, expected: "-1.0"},
+ {value: -1.0, expected: "-1.0"},
+ {value: -0.7, expected: "-1.0"},
+ {value: -0.5, expected: "0.0"},
+ {value: -0.3, expected: "0.0"},
+ {value: 0.0, expected: "0.0"},
+ {value: 0.3, expected: "0.0"},
+ {value: 0.5, expected: "1.0"},
+ {value: 0.7, expected: "1.0"},
+ {value: 1.0, expected: "1.0"},
+ {value: 1.5, expected: "2.0"},
+ } {
+ if rounded := roundJava(tc.value); fmt.Sprintf("%.1f", rounded) != tc.expected {
+ t.Errorf("Wrong rounding of %v, expected: %s, got: %.1f", tc.value, tc.expected, rounded)
+ }
+ }
+}
+
+func TestParseNumber(t *testing.T) {
+ for _, tc := range []struct {
+ value string
+ expected string
+ }{
+ {value: "-99.9", expected: "-999"},
+ {value: "-12.3", expected: "-123"},
+ {value: "-1.5", expected: "-15"},
+ {value: "-1.0", expected: "-10"},
+ {value: "0.0", expected: "0"},
+ {value: "0.3", expected: "3"},
+ {value: "12.3", expected: "123"},
+ {value: "99.9", expected: "999"},
+ } {
+ if number := parseNumber([]byte(tc.value)); fmt.Sprintf("%d", number) != tc.expected {
+ t.Errorf("Wrong parsing of %v, expected: %s, got: %d", tc.value, tc.expected, number)
+ }
+ }
+}
+
+var parseNumberSink int64
+
+func BenchmarkParseNumber(b *testing.B) {
+ data1 := []byte("1.2")
+ data2 := []byte("-12.3")
+
+ for i := 0; i < b.N; i++ {
+ parseNumberSink = parseNumber(data1) + parseNumber(data2)
+ }
+}
+
+func BenchmarkProcess(b *testing.B) {
+ // $ ./create_measurements.sh 1000000 && mv measurements.txt measurements-1e6.txt
+ // Created file with 1,000,000 measurements in 514 ms
+ const filename = "../../../../measurements-1e6.txt"
+
+ data, err := os.ReadFile(filename)
+ if err != nil {
+ b.Fatal(err)
+ }
+
+ measurements := process(data)
+ rows := int64(0)
+ for _, m := range measurements {
+ rows += m.count
+ }
+
+ b.ReportAllocs()
+ b.ResetTimer()
+ b.ReportMetric(float64(rows), "rows/op")
+
+ for i := 0; i < b.N; i++ {
+ process(data)
+ }
+}
diff --git a/src/main/go/AlexanderYastrebov/go.mod b/src/main/go/AlexanderYastrebov/go.mod
new file mode 100644
index 0000000..08f5bd1
--- /dev/null
+++ b/src/main/go/AlexanderYastrebov/go.mod
@@ -0,0 +1,3 @@
+module github.com/AlexanderYastrebov/1brc
+
+go 1.21.5