// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains the infrastructure to create an
// (identifier) index for a set of Go files.
//
// Basic indexing algorithm:
// - traverse all .go files of the file tree specified by root
// - for each word (identifier) encountered, collect all occurrences (spots)
//   into a list; this produces a list of spots for each word
// - reduce the lists: from a list of spots to a list of FileRuns,
//   and from a list of FileRuns into a list of PakRuns
// - make a HitList from the PakRuns
//
// Details:
// - keep two lists per word: one containing package-level declarations
//   that have snippets, and one containing all other spots
// - keep the snippets in a separate table indexed by snippet index
//   and store the snippet index in place of the line number in a SpotInfo
//   (the line number for spots with snippets is stored in the snippet)
// - at the end, create lists of alternative spellings for a given
//   word

package main

import (
	"container/vector";
	"go/ast";
	"go/parser";
	"go/token";
	"go/scanner";
	"os";
	pathutil "path";
	"sort";
	"strings";
)


// ----------------------------------------------------------------------------
// RunList

// A RunList is a vector of entries that can be sorted according to some
// criteria. A RunList may be compressed by grouping "runs" of entries
// which are equal (according to the sort criteria) into a new RunList of
// runs. For instance, a RunList containing pairs (x, y) may be compressed
// into a RunList containing pair runs (x, {y}) where each run consists of
// a list of y's with the same x.
type RunList struct { vector.Vector; less func(x, y interface{}) bool; } func (h *RunList) Less(i, j int) bool { return h.less(h.At(i), h.At(j)) } func (h *RunList) sort(less func(x, y interface{}) bool) { h.less = less; sort.Sort(h); } // Compress entries which are the same according to a sort criteria // (specified by less) into "runs". func (h *RunList) reduce(less func(x, y interface{}) bool, newRun func(h *RunList, i, j int) interface{}) *RunList { // create runs of entries with equal values h.sort(less); // for each run, make a new run object and collect them in a new RunList var hh RunList; i := 0; for j := 0; j < h.Len(); j++ { if less(h.At(i), h.At(j)) { hh.Push(newRun(h, i, j)); i = j; // start a new run } } // add final run, if any if i < h.Len() { hh.Push(newRun(h, i, h.Len())) } return &hh; } // ---------------------------------------------------------------------------- // SpotInfo // A SpotInfo value describes a particular identifier spot in a given file; // It encodes three values: the SpotKind (declaration or use), a line or // snippet index "lori", and whether it's a line or index. // // The following encoding is used: // // bits 32 4 1 0 // value [lori|kind|isIndex] // type SpotInfo uint32 // SpotKind describes whether an identifier is declared (and what kind of // declaration) or used. type SpotKind uint32 const ( PackageClause SpotKind = iota; ImportDecl; ConstDecl; TypeDecl; VarDecl; FuncDecl; MethodDecl; Use; nKinds; ) func init() { // sanity check: if nKinds is too large, the SpotInfo // accessor functions may need to be updated if nKinds > 8 { panic() } } // makeSpotInfo makes a SpotInfo. 
func makeSpotInfo(kind SpotKind, lori int, isIndex bool) SpotInfo { // encode lori: bits [4..32) x := SpotInfo(lori) << 4; if int(x>>4) != lori { // lori value doesn't fit - since snippet indices are // most certainly always smaller then 1<<28, this can // only happen for line numbers; give it no line number (= 0) x = 0 } // encode kind: bits [1..4) x |= SpotInfo(kind) << 1; // encode isIndex: bit 0 if isIndex { x |= 1 } return x; } func (x SpotInfo) Kind() SpotKind { return SpotKind(x >> 1 & 7) } func (x SpotInfo) Lori() int { return int(x >> 4) } func (x SpotInfo) IsIndex() bool { return x&1 != 0 } // ---------------------------------------------------------------------------- // KindRun // Debugging support. Disable to see multiple entries per line. const removeDuplicates = true // A KindRun is a run of SpotInfos of the same kind in a given file. type KindRun struct { Kind SpotKind; Infos []SpotInfo; } // KindRuns are sorted by line number or index. Since the isIndex bit // is always the same for all infos in one list we can compare lori's. func (f *KindRun) Len() int { return len(f.Infos) } func (f *KindRun) Less(i, j int) bool { return f.Infos[i].Lori() < f.Infos[j].Lori() } func (f *KindRun) Swap(i, j int) { f.Infos[i], f.Infos[j] = f.Infos[j], f.Infos[i] } // FileRun contents are sorted by Kind for the reduction into KindRuns. func lessKind(x, y interface{}) bool { return x.(SpotInfo).Kind() < y.(SpotInfo).Kind() } // newKindRun allocates a new KindRun from the SpotInfo run [i, j) in h. func newKindRun(h *RunList, i, j int) interface{} { kind := h.At(i).(SpotInfo).Kind(); infos := make([]SpotInfo, j-i); k := 0; for ; i < j; i++ { infos[k] = h.At(i).(SpotInfo); k++; } run := &KindRun{kind, infos}; // Spots were sorted by file and kind to create this run. // Within this run, sort them by line number or index. 
sort.Sort(run); if removeDuplicates { // Since both the lori and kind field must be // same for duplicates, and since the isIndex // bit is always the same for all infos in one // list we can simply compare the entire info. k := 0; var prev SpotInfo; for i, x := range infos { if x != prev || i == 0 { infos[k] = x; k++; prev = x; } } run.Infos = infos[0:k]; } return run; } // ---------------------------------------------------------------------------- // FileRun // A Pak describes a Go package. type Pak struct { Path string; // path of directory containing the package Name string; // package name as declared by package clause } // Paks are sorted by name (primary key) and by import path (secondary key). func (p *Pak) less(q *Pak) bool { return p.Name < q.Name || p.Name == q.Name && p.Path < q.Path } // A File describes a Go file. type File struct { Path string; // complete file name Pak Pak; // the package to which the file belongs } // A Spot describes a single occurence of a word. type Spot struct { File *File; Info SpotInfo; } // A FileRun is a list of KindRuns belonging to the same file. type FileRun struct { File *File; Groups []*KindRun; } // Spots are sorted by path for the reduction into FileRuns. func lessSpot(x, y interface{}) bool { return x.(Spot).File.Path < y.(Spot).File.Path } // newFileRun allocates a new FileRun from the Spot run [i, j) in h. func newFileRun(h0 *RunList, i, j int) interface{} { file := h0.At(i).(Spot).File; // reduce the list of Spots into a list of KindRuns var h1 RunList; h1.Vector.Resize(j-i, 0); k := 0; for ; i < j; i++ { h1.Set(k, h0.At(i).(Spot).Info); k++; } h2 := h1.reduce(lessKind, newKindRun); // create the FileRun groups := make([]*KindRun, h2.Len()); for i := 0; i < h2.Len(); i++ { groups[i] = h2.At(i).(*KindRun) } return &FileRun{file, groups}; } // ---------------------------------------------------------------------------- // PakRun // A PakRun describes a run of *FileRuns of a package. 
type PakRun struct {
	Pak Pak;
	Files []*FileRun;
}


// Sorting support for files within a PakRun: files are ordered by path.
func (p *PakRun) Len() int {
	return len(p.Files)
}

func (p *PakRun) Less(i, j int) bool {
	a, b := p.Files[i], p.Files[j];
	return a.File.Path < b.File.Path;
}

func (p *PakRun) Swap(i, j int) {
	a, b := p.Files[i], p.Files[j];
	p.Files[i] = b;
	p.Files[j] = a;
}


// FileRuns are sorted by package for the reduction into PakRuns.
func lessFileRun(x, y interface{}) bool {
	a := x.(*FileRun).File;
	b := y.(*FileRun).File;
	return a.Pak.less(&b.Pak);
}


// newPakRun allocates a new PakRun from the *FileRun run [i, j) in h.
func newPakRun(h *RunList, i, j int) interface{} {
	pak := h.At(i).(*FileRun).File.Pak;

	// copy the run out of h
	n := j - i;
	files := make([]*FileRun, n);
	for k := 0; k < n; k++ {
		files[k] = h.At(i + k).(*FileRun)
	}

	// files were sorted by package; sort them by file now
	run := &PakRun{pak, files};
	sort.Sort(run);
	return run;
}


// ----------------------------------------------------------------------------
// HitList

// A HitList describes a list of PakRuns.
type HitList []*PakRun


// PakRuns are sorted by package.
func lessPakRun(x, y interface{}) bool { return x.(*PakRun).Pak.less(&y.(*PakRun).Pak) } func reduce(h0 *RunList) HitList { // reduce a list of Spots into a list of FileRuns h1 := h0.reduce(lessSpot, newFileRun); // reduce a list of FileRuns into a list of PakRuns h2 := h1.reduce(lessFileRun, newPakRun); // sort the list of PakRuns by package h2.sort(lessPakRun); // create a HitList h := make(HitList, h2.Len()); for i := 0; i < h2.Len(); i++ { h[i] = h2.At(i).(*PakRun) } return h; } func (h HitList) filter(pakname string) HitList { // determine number of matching packages (most of the time just one) n := 0; for _, p := range h { if p.Pak.Name == pakname { n++ } } // create filtered HitList hh := make(HitList, n); i := 0; for _, p := range h { if p.Pak.Name == pakname { hh[i] = p; i++; } } return hh; } // ---------------------------------------------------------------------------- // AltWords type wordPair struct { canon string; // canonical word spelling (all lowercase) alt string; // alternative spelling } // An AltWords describes a list of alternative spellings for a // canonical (all lowercase) spelling of a word. type AltWords struct { Canon string; // canonical word spelling (all lowercase) Alts []string; // alternative spelling for the same word } // wordPairs are sorted by their canonical spelling. func lessWordPair(x, y interface{}) bool { return x.(*wordPair).canon < y.(*wordPair).canon } // newAltWords allocates a new AltWords from the *wordPair run [i, j) in h. 
func newAltWords(h *RunList, i, j int) interface{} { canon := h.At(i).(*wordPair).canon; alts := make([]string, j-i); k := 0; for ; i < j; i++ { alts[k] = h.At(i).(*wordPair).alt; k++; } return &AltWords{canon, alts}; } func (a *AltWords) filter(s string) *AltWords { if len(a.Alts) == 1 && a.Alts[0] == s { // there are no different alternatives return nil } // make a new AltWords with the current spelling removed alts := make([]string, len(a.Alts)); i := 0; for _, w := range a.Alts { if w != s { alts[i] = w; i++; } } return &AltWords{a.Canon, alts[0:i]}; } // ---------------------------------------------------------------------------- // Indexer // Adjust these flags as seems best. const excludeMainPackages = false const excludeTestFiles = false type IndexResult struct { Decls RunList; // package-level declarations (with snippets) Others RunList; // all other occurences } // An Indexer maintains the data structures and provides the machinery // for indexing .go files under a file tree. It implements the path.Visitor // interface for walking file trees, and the ast.Visitor interface for // walking Go ASTs. 
type Indexer struct { words map[string]*IndexResult; // RunLists of Spots snippets vector.Vector; // vector of *Snippets, indexed by snippet indices file *File; // current file decl ast.Decl; // current decl nspots int; // number of spots encountered } func (x *Indexer) addSnippet(s *Snippet) int { index := x.snippets.Len(); x.snippets.Push(s); return index; } func (x *Indexer) visitComment(c *ast.CommentGroup) { if c != nil { ast.Walk(x, c) } } func (x *Indexer) visitIdent(kind SpotKind, id *ast.Ident) { if id != nil { lists, found := x.words[id.Value]; if !found { lists = new(IndexResult); x.words[id.Value] = lists; } if kind == Use || x.decl == nil { // not a declaration or no snippet required info := makeSpotInfo(kind, id.Pos().Line, false); lists.Others.Push(Spot{x.file, info}); } else { // a declaration with snippet index := x.addSnippet(NewSnippet(x.decl, id)); info := makeSpotInfo(kind, index, true); lists.Decls.Push(Spot{x.file, info}); } x.nspots++; } } func (x *Indexer) visitSpec(spec ast.Spec, isVarDecl bool) { switch n := spec.(type) { case *ast.ImportSpec: x.visitComment(n.Doc); x.visitIdent(ImportDecl, n.Name); for _, s := range n.Path { ast.Walk(x, s) } x.visitComment(n.Comment); case *ast.ValueSpec: x.visitComment(n.Doc); kind := ConstDecl; if isVarDecl { kind = VarDecl } for _, n := range n.Names { x.visitIdent(kind, n) } ast.Walk(x, n.Type); for _, v := range n.Values { ast.Walk(x, v) } x.visitComment(n.Comment); case *ast.TypeSpec: x.visitComment(n.Doc); x.visitIdent(TypeDecl, n.Name); ast.Walk(x, n.Type); x.visitComment(n.Comment); } } func (x *Indexer) Visit(node interface{}) ast.Visitor { // TODO(gri): methods in interface types are categorized as VarDecl switch n := node.(type) { case nil: return nil case *ast.Ident: x.visitIdent(Use, n) case *ast.Field: x.decl = nil; // no snippets for fields x.visitComment(n.Doc); for _, m := range n.Names { x.visitIdent(VarDecl, m) } ast.Walk(x, n.Type); for _, s := range n.Tag { ast.Walk(x, s) } 
x.visitComment(n.Comment); case *ast.DeclStmt: if decl, ok := n.Decl.(*ast.GenDecl); ok { // local declarations can only be *ast.GenDecls x.decl = nil; // no snippets for local declarations x.visitComment(decl.Doc); for _, s := range decl.Specs { x.visitSpec(s, decl.Tok == token.VAR) } } else { // handle error case gracefully ast.Walk(x, n.Decl) } case *ast.GenDecl: x.decl = n; x.visitComment(n.Doc); for _, s := range n.Specs { x.visitSpec(s, n.Tok == token.VAR) } case *ast.FuncDecl: x.visitComment(n.Doc); kind := FuncDecl; if n.Recv != nil { kind = MethodDecl; ast.Walk(x, n.Recv); } x.decl = n; x.visitIdent(kind, n.Name); ast.Walk(x, n.Type); if n.Body != nil { ast.Walk(x, n.Body) } case *ast.File: x.visitComment(n.Doc); x.decl = nil; x.visitIdent(PackageClause, n.Name); for _, d := range n.Decls { ast.Walk(x, d) } // don't visit package level comments for now // to avoid duplicate visiting from individual // nodes default: return x } return nil; } func (x *Indexer) VisitDir(path string, d *os.Dir) bool { return true } func (x *Indexer) VisitFile(path string, d *os.Dir) { if !isGoFile(d) { return } if excludeTestFiles && (!isPkgFile(d) || strings.HasPrefix(path, "test/")) { return } if excludeMainPackages && pkgName(path) == "main" { return } file, err := parser.ParseFile(path, nil, parser.ParseComments); if err != nil { return // ignore files with (parse) errors } dir, _ := pathutil.Split(path); pak := Pak{dir, file.Name.Value}; x.file = &File{path, pak}; ast.Walk(x, file); } // ---------------------------------------------------------------------------- // Index type LookupResult struct { Decls HitList; // package-level declarations (with snippets) Others HitList; // all other occurences } type Index struct { words map[string]*LookupResult; // maps words to hit lists alts map[string]*AltWords; // maps canonical(words) to lists of alternative spellings snippets []*Snippet; // all snippets, indexed by snippet index nspots int; // number of spots indexed (a measure 
of the index size) } func canonical(w string) string { return strings.ToLower(w) } // NewIndex creates a new index for the file tree rooted at root. func NewIndex(root string) *Index { var x Indexer; // initialize Indexer x.words = make(map[string]*IndexResult); // collect all Spots pathutil.Walk(root, &x, nil); // for each word, reduce the RunLists into a LookupResult; // also collect the word with its canonical spelling in a // word list for later computation of alternative spellings words := make(map[string]*LookupResult); var wlist RunList; for w, h := range x.words { decls := reduce(&h.Decls); others := reduce(&h.Others); words[w] = &LookupResult{ Decls: decls, Others: others, }; wlist.Push(&wordPair{canonical(w), w}); } // reduce the word list {canonical(w), w} into // a list of AltWords runs {canonical(w), {w}} alist := wlist.reduce(lessWordPair, newAltWords); // convert alist into a map of alternative spellings alts := make(map[string]*AltWords); for i := 0; i < alist.Len(); i++ { a := alist.At(i).(*AltWords); alts[a.Canon] = a; } // convert snippet vector into a list snippets := make([]*Snippet, x.snippets.Len()); for i := 0; i < x.snippets.Len(); i++ { snippets[i] = x.snippets.At(i).(*Snippet) } return &Index{words, alts, snippets, x.nspots}; } // Size returns the number of different words and // spots indexed as a measure for the index size. 
func (x *Index) Size() (nwords int, nspots int) { return len(x.words), x.nspots } func (x *Index) LookupWord(w string) (match *LookupResult, alt *AltWords) { match, _ = x.words[w]; alt, _ = x.alts[canonical(w)]; // remove current spelling from alternatives // (if there is no match, the alternatives do // not contain the current spelling) if match != nil && alt != nil { alt = alt.filter(w) } return; } func isIdentifier(s string) bool { var S scanner.Scanner; S.Init("", strings.Bytes(s), nil, 0); if _, tok, _ := S.Scan(); tok == token.IDENT { _, tok, _ := S.Scan(); return tok == token.EOF; } return false; } // For a given query, which is either a single identifier or a qualified // identifier, Lookup returns a LookupResult, and a list of alternative // spellings, if any. If the query syntax is wrong, illegal is set. func (x *Index) Lookup(query string) (match *LookupResult, alt *AltWords, illegal bool) { ss := strings.Split(query, ".", 0); // check query syntax for _, s := range ss { if !isIdentifier(s) { illegal = true; return; } } switch len(ss) { case 1: match, alt = x.LookupWord(ss[0]) case 2: pakname := ss[0]; match, alt = x.LookupWord(ss[1]); if match != nil { // found a match - filter by package name decls := match.Decls.filter(pakname); others := match.Others.filter(pakname); match = &LookupResult{decls, others}; } default: illegal = true } return; } func (x *Index) Snippet(i int) *Snippet { // handle illegal snippet indices gracefully if 0 <= i && i < len(x.snippets) { return x.snippets[i] } return nil; }