Skip to content

HNSW Index

Overview

HNSW (Hierarchical Navigable Small World) is an approximate nearest neighbor (ANN) search algorithm that constructs a multi-layer graph structure to achieve efficient search. It provides a good balance between search speed and accuracy, making it the default choice for large-scale vector search scenarios.

Algorithm Principles

Multi-layer Structure

HNSW constructs a hierarchical structure where: - Bottom layer: Contains all data points - Upper layers: Contain a subset of points with longer "skip" connections - Search starts from the top layer and progressively narrows down

graph TB
    subgraph Layer2["Layer 2 (Top)"]
        L2_A["A"]
        L2_C["C"]
    end
    subgraph Layer1["Layer 1 (Middle)"]
        L1_A["A"]
        L1_B["B"]
        L1_C["C"]
        L1_D["D"]
    end
    subgraph Layer0["Layer 0 (Bottom)"]
        L0_A["A"]
        L0_B["B"]
        L0_C["C"]
        L0_D["D"]
        L0_E["E"]
    end
    L2_A --> L1_A
    L2_A --> L1_C
    L2_C --> L1_C
    L1_A --> L0_A
    L1_A --> L0_B
    L1_B --> L0_B
    L1_C --> L0_C
    L1_C --> L0_D
    L1_D --> L0_D
    L1_D --> L0_E

Search Process

  1. Start from the top layer entry point
  2. Find the nearest neighbor in current layer
  3. Move to the next layer with this point as entry
  4. Repeat until bottom layer is reached
  5. Perform local search at bottom layer to refine results

Implementation Details

Data Structures

// HNSWIndex is an in-memory hierarchical navigable small world index.
// All mutating operations take mu for writing; Search takes it for reading,
// so concurrent searches are allowed but inserts are exclusive.
type HNSWIndex struct {
    mu        sync.RWMutex
    metric    Distance               // distance metric used for every vector comparison
    params    *HNSWParams            // tuning knobs: M, EfConstruction, EfSearch, K
    layers    []map[string]*HNSWNode // Multi-layer graph
    entry     string                 // Entry point ID
    maxLayer  int                    // Maximum layer number
}

// HNSWNode is a single point in the graph. The same *HNSWNode is shared
// across every layer map from 0 up to Layer (see AddPoints), so Friends
// holds per-layer adjacency lists keyed by layer number.
type HNSWNode struct {
    ID       string
    Vector   []float32
    Layer    int              // highest layer this node appears on
    Friends  map[int][]string // Layer -> Friend node IDs
}

Default Parameters

// DefaultHNSWParams is the recommended starting configuration; see the
// "Parameter Tuning" tables for the trade-offs behind each value.
var DefaultHNSWParams = HNSWParams{
    M:              16,  // Maximum connections per node
    EfConstruction: 200, // Candidate list size during construction
    EfSearch:       64,  // Candidate list size during search
    K:              10,  // Number of neighbors to return
}

Construction Algorithm

// AddPoints inserts a batch of points into the index under the write lock.
//
// For each point it draws an insertion layer from an exponential
// distribution (randomLayer), registers the node on every layer from 0 up
// to that layer, then wires it to its nearest neighbors per layer.
//
// Returns nil; the error result is kept for interface compatibility.
func (h *HNSWIndex) AddPoints(points []*PointStruct) error {
    h.mu.Lock()
    defer h.mu.Unlock()

    for _, p := range points {
        // Determine insertion layer (random with exponential distribution)
        layer := h.randomLayer()

        // Create node
        node := &HNSWNode{
            ID:      p.ID,
            Vector:  p.Vector,
            Layer:   layer,
            Friends: make(map[int][]string),
        }

        // BUG FIX: grow the layers slice first — indexing h.layers[i]
        // beyond len(h.layers) panics with "index out of range".
        for len(h.layers) <= layer {
            h.layers = append(h.layers, make(map[string]*HNSWNode))
        }

        // Insert into layers
        for i := 0; i <= layer; i++ {
            if h.layers[i] == nil {
                h.layers[i] = make(map[string]*HNSWNode)
            }
            h.layers[i][p.ID] = node
        }

        // Connect to neighbors in each layer. The node is already present
        // in the layer maps, so searchLayer can reach it from here on.
        for i := 0; i <= layer; i++ {
            h.connectNode(node, i)
        }

        // Update entry point if needed.
        // BUG FIX: the very first insert must also claim the entry point,
        // even when its layer equals the zero-value maxLayer (0) —
        // otherwise h.entry stays "" and every search starts nowhere.
        if h.entry == "" || layer > h.maxLayer {
            if layer > h.maxLayer {
                h.maxLayer = layer
            }
            h.entry = p.ID
        }
    }
    return nil
}

Neighbor Connection

// connectNode wires node into the given layer: it searches the layer for
// the EfConstruction closest candidates, links node bidirectionally to the
// best M of them, and prunes any adjacency list that exceeds M.
//
// Caller must hold h.mu for writing.
func (h *HNSWIndex) connectNode(node *HNSWNode, layer int) {
    candidates := h.searchLayer(node.Vector, layer, h.params.EfConstruction)

    // Select M nearest neighbors
    neighbors := candidates[:min(len(candidates), h.params.M)]

    for _, neighbor := range neighbors {
        // BUG FIX: node is already registered in the layer maps when this
        // runs (see AddPoints), so searchLayer can return it — skip it to
        // avoid a self-loop.
        if neighbor.ID == node.ID {
            continue
        }

        // Add bidirectional connections
        node.Friends[layer] = append(node.Friends[layer], neighbor.ID)
        neighbor.Friends[layer] = append(neighbor.Friends[layer], node.ID)

        // BUG FIX: prune the neighbor as well — previously only the new
        // node was pruned, so existing nodes' adjacency lists grew without
        // bound as more points were inserted.
        if len(neighbor.Friends[layer]) > h.params.M {
            neighbor.Friends[layer] = h.pruneConnections(neighbor, layer)
        }
    }

    // Prune excess connections on the new node itself.
    if len(node.Friends[layer]) > h.params.M {
        node.Friends[layer] = h.pruneConnections(node, layer)
    }
}
// searchLayer runs a greedy best-first search restricted to one layer of
// the graph, starting from the index-wide entry point.
//
// query — target vector; layer — graph layer to search; k — maximum number
// of nodes to return (also the beam width).
//
// Returns up to k nodes sorted by ascending distance to query. Returns an
// empty slice when the entry point is absent from this layer.
//
// BUG FIX: the previous version did not compile — it was declared to
// return []*HNSWNode but accumulated ScoredPoint values, and referenced
// undefined fields (results[i].score, current.worstScore, current.Node).
// The termination test was also inverted. This version keeps node/distance
// pairs together so results stay sorted and the declared return type is
// honored, satisfying both callers (connectNode needs *HNSWNode, Search
// needs IDs).
//
// NOTE(review): assumes PriorityQueue is a min-heap whose Pop returns the
// closest pending node together with its distance — confirm against the
// PriorityQueue implementation.
func (h *HNSWIndex) searchLayer(query []float32, layer int, k int) []*HNSWNode {
    // scored pairs a node with its distance so the result set can be kept
    // sorted without a second lookup.
    type scored struct {
        node *HNSWNode
        dist float32
    }

    visited := make(map[string]bool)
    candidates := &PriorityQueue{}
    results := make([]scored, 0, k+1)

    // Start from entry point, if it exists on this layer.
    if entry, ok := h.layers[layer][h.entry]; ok {
        d := CalculateDistance(h.metric, query, entry.Vector)
        candidates.Push(entry, d)
        visited[entry.ID] = true
        results = append(results, scored{entry, d})
    }

    for candidates.Len() > 0 {
        current, dist := candidates.Pop()

        // Termination: with k results in hand, if the closest unexplored
        // candidate is already farther than our worst result, no neighbor
        // it leads to can improve the result set.
        if len(results) >= k && dist > results[len(results)-1].dist {
            break
        }

        // Expand the current node's neighbors on this layer.
        for _, friendID := range current.Friends[layer] {
            if visited[friendID] {
                continue
            }
            visited[friendID] = true

            friend, ok := h.layers[layer][friendID]
            if !ok {
                continue // stale edge: neighbor no longer on this layer
            }
            dist := CalculateDistance(h.metric, query, friend.Vector)

            // Admit the friend only if it can enter the current top-k.
            if len(results) < k || dist < results[len(results)-1].dist {
                candidates.Push(friend, dist)
                results = append(results, scored{friend, dist})
                sort.Slice(results, func(i, j int) bool {
                    return results[i].dist < results[j].dist
                })
                if len(results) > k {
                    results = results[:k]
                }
            }
        }
    }

    out := make([]*HNSWNode, len(results))
    for i, r := range results {
        out[i] = r.node
    }
    return out
}

Search Algorithm

// Search answers a top-K nearest-neighbor query: a descent through the
// upper layers to locate an entry point, a wider beam search on the bottom
// layer, then payload filtering and final ranking. Read lock only, so
// searches may run concurrently.
func (h *HNSWIndex) Search(query []float32, filter *Filter, topK int) ([]ScoredPoint, error) {
    h.mu.RLock()
    defer h.mu.RUnlock()

    // Phase 1: Top-layer search for entry point
    // NOTE(review): entryID is computed here but never consumed — searchLayer
    // always starts from h.entry, so this descent has no effect on Phase 2
    // (Go also rejects the declared-and-unused variable as written). The
    // entry point needs to be threaded into searchLayer, or this phase
    // removed. The loop also stops at i > 0, skipping no layer only by
    // accident of Phase 2 covering layer 0 — verify intent.
    if h.maxLayer > 0 {
        entryID := h.layers[h.maxLayer][h.entry].ID
        for i := h.maxLayer - 1; i > 0; i-- {
            candidates := h.searchLayer(query, i, 1)
            if len(candidates) > 0 {
                entryID = candidates[0].ID
            }
        }
    }

    // Phase 2: Bottom-layer search with a beam of EfSearch candidates.
    candidates := h.searchLayer(query, 0, h.params.EfSearch)

    // Phase 3: Apply filter and sort
    results := make([]ScoredPoint, 0)
    for _, candidate := range candidates {
        node := h.layers[0][candidate.ID]
        // Drop candidates whose payload fails the filter, if one was given.
        if filter != nil && !MatchFilter(h.getPayload(node.ID), filter) {
            continue
        }
        results = append(results, ScoredPoint{
            ID:      node.ID,
            Version: 0,
            // Distance is recomputed here rather than reused from the
            // layer search.
            Score:   CalculateDistance(h.metric, query, node.Vector),
        })
    }

    // compareScores orders by metric semantics (smaller-is-better for
    // distances, larger-is-better for similarities).
    sort.Slice(results, func(i, j int) bool {
        return compareScores(h.metric, results[i].Score, results[j].Score)
    })

    if len(results) > topK {
        results = results[:topK]
    }

    return results, nil
}

Parameter Tuning

M (Maximum Connections)

Value Memory Build Time Search Accuracy
8 Low Fast Lower
16 Medium Medium Good (default)
32 High Slow Higher

EfConstruction

Value Build Time Memory Index Quality
100 Fast Low Lower
200 Medium Medium Good (default)
400 Slow High Higher

EfSearch

Value Latency Memory Recall
32 Low Low ~90%
64 Medium Medium ~95% (default)
128 High High ~99%

Performance Characteristics

Time Complexity

Operation Complexity Description
Add O(log n) Average insertion time
Search O(log n) Average search time
Delete O(log n) Average deletion time

Space Complexity

Component Space Description
Vectors O(n) Original vector storage
Graph O(n log n) Connection information
Total O(n log n) Linear-logarithmic

Performance Metrics

Tested on 1,000,000 128-dimensional vectors:

Metric Value
Search (topK=10) ~2ms
Index build time ~30 minutes
Memory overhead ~30%
Recall@10 ~95%

Usage Scenarios

Suitable For

  • Large-scale datasets (> 10,000 vectors)
  • Latency-sensitive applications
  • High query throughput requirements
  • Memory-constrained environments (only when combined with vector quantization — see Trade-offs below)

Trade-offs

  • Higher memory usage compared to Flat
  • Index construction is slower
  • Approximate results (not exact)

Best Practices

  1. Start with defaults: Use DefaultHNSWParams for initial testing
  2. Tune incrementally: Adjust one parameter at a time
  3. Balance trade-offs: Higher accuracy comes at the cost of speed/memory
  4. Monitor metrics: Track recall, latency, and memory usage
  5. Consider quantization: Enable for very large datasets