Go Performance Optimization: From Basics to Advanced
Tags: golang, performance, optimization, profiling, advanced

Go Performance Optimization: From Basics to Advanced

Learn how to optimize Go applications for maximum performance. Covering profiling, memory management, concurrency optimization, and advanced techniques.

15 min read
1,876 views

Go Performance Fundamentals

Go is known for its excellent performance, but understanding how to optimize your code is crucial for building high-performance applications. Let's explore the key areas of Go performance optimization.

Measuring Performance

Benchmarking

go
package main

import (
    "testing"
    "strings"
)

// Benchmark string concatenation methods
// BenchmarkStringConcat measures naive += concatenation: every append
// reallocates the string, so building 1000 characters does quadratic work.
func BenchmarkStringConcat(b *testing.B) {
    for n := 0; n < b.N; n++ {
        s := ""
        for k := 0; k < 1000; k++ {
            s += "a"
        }
    }
}

// BenchmarkStringBuilder measures the same 1000-character build using
// strings.Builder, which grows its buffer amortized instead of copying
// the whole string on each append.
func BenchmarkStringBuilder(b *testing.B) {
    for n := 0; n < b.N; n++ {
        var sb strings.Builder
        for k := 0; k < 1000; k++ {
            sb.WriteString("a")
        }
        _ = sb.String()
    }
}

// Run benchmarks
// go test -bench=.

Profiling

go
import (
    "os"
    "runtime/pprof"
)

// main demonstrates CPU profiling: it records a pprof CPU profile to
// cpu.prof for the duration of doWork (defined elsewhere), then view it
// with "go tool pprof cpu.prof".
func main() {
    // CPU profiling
    f, err := os.Create("cpu.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // Fix: StartCPUProfile returns an error (e.g. profiling already
    // enabled) that the original silently discarded.
    if err := pprof.StartCPUProfile(f); err != nil {
        log.Fatal(err)
    }
    defer pprof.StopCPUProfile()

    // Your code here
    doWork()
}

// Memory profiling
// profileMemory writes a heap profile to mem.prof. It forces a GC first
// so the profile reflects live memory rather than collectible garbage.
func profileMemory() {
    f, err := os.Create("mem.prof")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    runtime.GC() // Get accurate memory stats
    // Fix: WriteHeapProfile's error was silently dropped.
    if err := pprof.WriteHeapProfile(f); err != nil {
        log.Fatal(err)
    }
}

Memory Optimization

1. Avoid Unnecessary Allocations

go
// Bad - creates new slice on each call
// processData returns the positive elements of data, each doubled.
// Kept deliberately suboptimal: result starts with zero capacity, so
// append reallocates and copies as it grows.
func processData(data []int) []int {
    out := []int{} // This allocates!
    for _, n := range data {
        if n > 0 {
            out = append(out, 2*n)
        }
    }
    return out
}

// Good - pre-allocate with known capacity
// processData returns the positive elements of data, each doubled,
// using a single up-front allocation sized to the worst case.
func processData(data []int) []int {
    out := make([]int, 0, len(data)) // one allocation, no regrowth
    for _, n := range data {
        if n <= 0 {
            continue
        }
        out = append(out, 2*n)
    }
    return out
}

// Even better - reuse slice
// processDataReuse writes the doubled positive elements of data into
// result's backing array, avoiding any allocation when result already
// has sufficient capacity. The caller must use the returned slice.
func processDataReuse(data []int, result []int) []int {
    result = result[:0] // truncate, keeping the backing array
    for i := range data {
        if v := data[i]; v > 0 {
            result = append(result, 2*v)
        }
    }
    return result
}

2. String Optimization

go
// Bad - creates many temporary strings
// buildString joins parts with spaces via += concatenation; each +=
// allocates a fresh string, making total work quadratic in the output
// length.
// NOTE(review): this variant also appends a trailing space after the
// last part, so its output differs from the strings.Builder variant
// below (which only puts spaces between parts).
func buildString(parts []string) string {
    result := ""
    for _, part := range parts {
        result += part + " "
    }
    return result
}

// Good - use strings.Builder
// buildString joins parts with single spaces using a Builder, which
// grows one buffer amortized instead of reallocating per concatenation.
func buildString(parts []string) string {
    var sb strings.Builder
    sb.Grow(len(parts) * 10) // rough pre-sizing to limit regrowth
    for i, p := range parts {
        if i != 0 {
            sb.WriteByte(' ')
        }
        sb.WriteString(p)
    }
    return sb.String()
}

3. Avoid Interface When Possible

go
// Bad - interface{} causes boxing
// processAny prints every string element of data; non-string elements
// are skipped. Each element access pays a type-assertion check.
func processAny(data []interface{}) {
    for _, item := range data {
        s, ok := item.(string)
        if !ok {
            continue
        }
        fmt.Println(s)
    }
}

// Good - use generics or specific types
// processStrings prints every element; the concrete []string type needs
// no type assertions and no boxing.
func processStrings(data []string) {
    for i := range data {
        fmt.Println(data[i])
    }
}

Concurrency Optimization

1. Goroutine Pool Pattern

go
// WorkerPool distributes Jobs across a fixed set of goroutines and
// collects Results on a shared channel. Job, Result, and processJob
// are defined elsewhere in the project.
type WorkerPool struct {
    workers    int
    jobQueue   chan Job
    resultChan chan Result
    wg         sync.WaitGroup
}

// NewWorkerPool builds a pool with both channel buffers sized at twice
// the worker count, letting producers and consumers run slightly ahead
// of each other.
func NewWorkerPool(workers int) *WorkerPool {
    return &WorkerPool{
        workers:    workers,
        jobQueue:   make(chan Job, workers*2),
        resultChan: make(chan Result, workers*2),
    }
}

// Start launches the worker goroutines. Call exactly once.
func (wp *WorkerPool) Start() {
    for i := 0; i < wp.workers; i++ {
        wp.wg.Add(1)
        go wp.worker()
    }
}

// worker drains jobQueue until Close closes it, publishing each result.
func (wp *WorkerPool) worker() {
    defer wp.wg.Done()
    for job := range wp.jobQueue {
        result := processJob(job)
        wp.resultChan <- result
    }
}

// Submit enqueues a job, blocking while the queue buffer is full.
// NOTE(review): Submit after Close panics (send on closed channel) —
// callers must coordinate shutdown externally.
func (wp *WorkerPool) Submit(job Job) {
    wp.jobQueue <- job
}

// Close stops accepting jobs, waits for in-flight work to finish, then
// closes the result channel so readers ranging over it terminate.
// NOTE(review): if nothing drains resultChan concurrently, workers
// block once its buffer (workers*2) fills and wg.Wait deadlocks —
// confirm callers consume results while Close runs.
func (wp *WorkerPool) Close() {
    close(wp.jobQueue)
    wp.wg.Wait()
    close(wp.resultChan)
}

2. Channel Optimization

go
// Use buffered channels when appropriate
// processWithBufferedChannel fans processItem (defined elsewhere) out
// to one goroutine per element and prints each result as it arrives.
// The buffer is sized to len(data) so no sender ever blocks.
func processWithBufferedChannel(data []int) {
    results := make(chan int, len(data)) // Buffered

    for _, v := range data {
        go func(x int) {
            results <- processItem(x)
        }(v)
    }

    // Receive exactly one result per input; arrival order is
    // scheduler-dependent.
    for range data {
        fmt.Println(<-results)
    }
}

3. Sync.Pool for Object Reuse

go
// bufferPool recycles byte-slice scratch buffers.
// Fix: the pool stores *[]byte rather than []byte — putting a bare
// slice into an interface{} copies the slice header to the heap on
// every Put, allocating each time (staticcheck SA6002) and defeating
// the point of pooling.
var bufferPool = sync.Pool{
    New: func() interface{} {
        b := make([]byte, 0, 1024)
        return &b
    },
}

// getBuffer returns a zero-length buffer with at least 1 KiB of
// reusable capacity.
func getBuffer() []byte {
    return *bufferPool.Get().(*[]byte)
}

// putBuffer returns buf to the pool; the caller must not use buf
// afterwards.
func putBuffer(buf []byte) {
    buf = buf[:0] // Reset length, keep capacity
    bufferPool.Put(&buf)
}

// processData copies data into a pooled scratch buffer and releases it
// when done.
// NOTE(review): the deferred putBuffer captured buf's original header;
// if append grows past capacity, the grown array is simply not pooled —
// harmless, but the reuse benefit is lost for oversized inputs.
func processData(data []byte) {
    buf := getBuffer()
    defer putBuffer(buf)

    // Use buffer
    buf = append(buf, data...)
    // Process...
}

CPU Optimization

1. Loop Unrolling

go
// Bad - many loop iterations
// sumArray returns the sum of all elements using a plain one-at-a-time
// loop (the baseline for the unrolled variant).
func sumArray(data []int) int {
    total := 0
    for i := range data {
        total += data[i]
    }
    return total
}

// Good - unroll small loops
// sumArrayUnrolled returns the sum of all elements, processing four per
// iteration to reduce loop overhead, then summing the remaining tail.
func sumArrayUnrolled(data []int) int {
    var sum int
    n := len(data)
    i := 0

    // Main body: four elements per iteration while at least four remain.
    for ; i+4 <= n; i += 4 {
        sum += data[i] + data[i+1] + data[i+2] + data[i+3]
    }

    // Tail: fewer than four elements left.
    for ; i < n; i++ {
        sum += data[i]
    }

    return sum
}

2. Avoid Function Calls in Hot Paths

go
// Bad - function call overhead
// processItems routes each item through the isValid and process helpers
// (defined elsewhere). The example's point: in a hot loop each call
// adds overhead unless the compiler inlines it.
// NOTE(review): Go's inliner usually does inline small leaf functions —
// measure with -gcflags=-m before hand-inlining as in the variant below.
func processItems(items []Item) {
    for _, item := range items {
        if isValid(item) { // Function call
            process(item)  // Function call
        }
    }
}

// Good - inline simple operations
// processItems doubles the value of every valid, positive item, in
// place.
// Fix: the original ranged by value ("for _, item := range items"), so
// "item.value *= 2" mutated a per-iteration copy and the slice was
// never updated. Indexing mutates the actual elements.
func processItems(items []Item) {
    for i := range items {
        if items[i].valid && items[i].value > 0 { // Inline
            items[i].value *= 2 // Inline
        }
    }
}

3. Use Appropriate Data Structures

go
// Bad - slice for frequent lookups
// findInSlice reports whether target occurs in data via a linear scan —
// O(n) per lookup, which adds up when called frequently.
func findInSlice(data []string, target string) bool {
    for i := range data {
        if data[i] == target {
            return true
        }
    }
    return false
}

// Good - map for O(1) lookups
// findInMap reports whether target maps to true; absent keys yield the
// zero value false, so no presence check is needed.
func findInMap(data map[string]bool, target string) bool {
    found := data[target]
    return found
}

Advanced Optimization Techniques

1. SIMD with Assembly

go
// addInt32s adds a and b into result (assumption: element-wise, with
// result at least as long as the inputs — TODO confirm against the .s
// implementation). The go:noescape directive promises the compiler the
// slices do not escape through this call, so callers' backing arrays
// can stay on the stack.
//go:noescape
func addInt32s(a, b []int32, result []int32)

// Assembly implementation for SIMD
// This would be in a .s file

2. Memory Layout Optimization

go
// Bad - poor cache locality
// BadStruct interleaves small scalars with pointer-bearing fields.
type BadStruct struct {
    name    string
    id      int
    active  bool
    data    []byte
    count   int
}

// Good - group related fields
// GoodStruct groups the frequently-accessed scalars (id, count, active)
// so they tend to share a cache line.
// NOTE(review): on 64-bit both layouts occupy the same 64 bytes after
// padding — the benefit here is locality of hot fields, not a smaller
// struct; verify with the fieldalignment analyzer before relying on it.
type GoodStruct struct {
    id      int
    count   int
    active  bool
    name    string
    data    []byte
}

3. Compiler Optimizations

go
// Use build tags for optimization
// NOTE(review): build constraints are file-scoped — a //go:build line
// only takes effect at the very top of a file, before the package
// clause. The two expensiveOperation variants below must live in two
// separate files (e.g. op.go and op_debug.go); as a single snippet this
// would not compile (duplicate function names, misplaced constraints).
//go:build !debug
// +build !debug

// expensiveOperation: release build, no logging overhead.
func expensiveOperation() {
    // Optimized version
}

//go:build debug
// +build debug

// expensiveOperation: debug build that traces its progress.
func expensiveOperation() {
    // Debug version with logging
    log.Println("Starting expensive operation")
    // ... operation
}

Profiling Tools

1. pprof Web Interface

go
import _ "net/http/pprof"

// main exposes the pprof debug endpoints on localhost:6060 by running
// the default mux in a background goroutine; the blank import of
// net/http/pprof above registers the handlers on http.DefaultServeMux
// as a side effect. Profiles are then available under /debug/pprof/.
func main() {
    go func() {
        // ListenAndServe blocks and only returns on error; log it
        // rather than crash so the main goroutine keeps running.
        log.Println(http.ListenAndServe("localhost:6060", nil))
    }()

    // Your application code
}

2. Runtime Metrics

go
import "runtime"

// printMemStats prints the runtime's current allocation and GC counters
// (live heap, cumulative allocations, OS-reserved memory, GC cycles)
// on a single line.
func printMemStats() {
    var stats runtime.MemStats
    runtime.ReadMemStats(&stats)

    fmt.Printf("Alloc = %d KB", stats.Alloc/1024)
    fmt.Printf("\tTotalAlloc = %d KB", stats.TotalAlloc/1024)
    fmt.Printf("\tSys = %d KB", stats.Sys/1024)
    fmt.Printf("\tNumGC = %d\n", stats.NumGC)
}

3. Trace Analysis

go
import (
    "os"
    "runtime/trace"
)

// main records a runtime execution trace to trace.out for the life of
// the program; inspect it with "go tool trace trace.out".
func main() {
    f, err := os.Create("trace.out")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // Fix: trace.Start can fail (e.g. tracing already enabled); the
    // original discarded the error.
    if err := trace.Start(f); err != nil {
        log.Fatal(err)
    }
    defer trace.Stop()

    // Your code here
}

Performance Best Practices

1. Measure Before Optimizing

go
// Always benchmark your changes
// BenchmarkOptimized is a skeleton: place one iteration of the code
// under test inside the b.N loop. b.ResetTimer excludes any setup
// placed above it from the timed region (a no-op here, since there is
// no setup yet).
func BenchmarkOptimized(b *testing.B) {
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        // Your optimized code
    }
}

2. Use the Right Tool

  • CPU bound: Optimize algorithms, use SIMD
  • Memory bound: Reduce allocations, improve cache locality
  • I/O bound: Use async operations, connection pooling

3. Profile in Production

go
// Add profiling endpoints
// setupProfiling wires the pprof HTTP handlers onto the default mux.
// NOTE(review): these handlers (Index, Cmdline, Profile, Symbol, Trace)
// come from net/http/pprof, not runtime/pprof — the import must match.
// Importing net/http/pprof already registers them on
// http.DefaultServeMux as a side effect; explicit registration like
// this is only needed when serving through a custom mux.
func setupProfiling() {
    http.HandleFunc("/debug/pprof/", pprof.Index)
    http.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
    http.HandleFunc("/debug/pprof/profile", pprof.Profile)
    http.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
    http.HandleFunc("/debug/pprof/trace", pprof.Trace)
}

Common Performance Pitfalls

  1. Premature optimization - Measure first!
  2. Ignoring memory allocations - Use profiling tools
  3. Not using buffered channels - When you know the capacity
  4. String concatenation in loops - Use strings.Builder
  5. Not reusing objects - Use sync.Pool

Real-World Example: HTTP Server Optimization

go
// optimizedHTTPServer combines the techniques above: a pooled HTTP
// transport, a shared client, and a CPU-sized worker pool draining
// requestChan (defined elsewhere).
// NOTE(review): `client` is declared but never used below — as written
// this is a compile error ("declared and not used"); presumably the
// workers are meant to use it. Verify against the complete example.
func optimizedHTTPServer() {
    // Use connection pooling
    transport := &http.Transport{
        MaxIdleConns:        100,
        MaxIdleConnsPerHost: 10,
        IdleConnTimeout:     90 * time.Second,
    }

    client := &http.Client{Transport: transport}

    // Use worker pool for processing
    pool := NewWorkerPool(runtime.NumCPU())
    pool.Start()
    defer pool.Close()

    // Process requests
    for req := range requestChan {
        pool.Submit(req)
    }
}

Performance optimization in Go is about understanding the trade-offs and measuring the impact of your changes. Use the profiling tools, benchmark your code, and optimize based on real data, not assumptions.

Remember: Premature optimization is the root of all evil - but when you need performance, Go gives you the tools to achieve it! 🚀