内容简介:搜索引擎最核心的地方在于倒排索引,而倒排索引其实并不是一种具体的数据结构,确切的来说是一类。 这个实现中使用Golang中的执行日志:TODO:
搜索引擎最核心的地方在于倒排索引,而倒排索引其实并不是一种具体的数据结构,确切的来说是一类。
这个实现中使用Golang中的 map
来做倒排索引,全部代码如下:
package main import ( "io/ioutil" "log" "os" ) // Index 倒排索引索引项 type Index struct { DocumentName string Offset int } // ToParsingFile 待索引的文件 type ToParsingFile struct { FileName string Content string } // ReverseIndex 单个倒排索引具体 type ReverseIndex map[string][]Index // Blacklist 倒排索引key的黑名单 type Blacklist map[string]bool // Ignorelist 忽略字符列表 type Ignorelist map[rune]bool func readFile(fileName string) (string, error) { contentBytes, err := ioutil.ReadFile(fileName) if err != nil { log.Printf("open file failed with error: %v\n", err) return "", err } return string(contentBytes), nil } func parse(ignorelist Ignorelist, fileName, content string) ReverseIndex { reverseFile := make(ReverseIndex) lastIndex := 0 for i, c := range(content) { if _, ok := ignorelist[c]; ok { newWord := string(content[lastIndex:i]) reverseFile[newWord] = append(reverseFile[newWord], Index{fileName, lastIndex}) lastIndex = i + 1 } } return reverseFile } // 合并多个倒排索引并且剔除blacklist中的key func mergeReverseIndex(blacklist Blacklist, args ...ReverseIndex) ReverseIndex { result := make(ReverseIndex) for _, arg := range(args) { for k, vs := range(arg) { // 忽略黑名单中的key if _, ok := blacklist[k]; ok { continue } // 忽略连续断字符组成的 if len(k) == 0 { continue } // TODO: write myself a `extend` function for Golang slice for _, v := range(vs) { result[k] = append(result[k], v) } } } return result } func getDocumentNames(reverseFile ReverseIndex, keyWord string) map[string]bool { resultMap := make(map[string]bool) indexList, ok := reverseFile[keyWord] if !ok { return resultMap } for _, index := range(indexList) { resultMap[index.DocumentName] = true } return resultMap } func makeSubSetOf(map1, map2 map[string]bool) map[string]bool { var longer, shorter map[string]bool resultMap := make(map[string]bool) if len(map1) > len(map2) { longer = map1 shorter = map2 } else { longer = map2 shorter = map1 } for k := range(shorter) { if _, ok := longer[k]; ok { resultMap[k] = true } } return resultMap } // 搜索,目前的检索方案为: // 检索出每个关键字所在的文章 // 对关键字做交集处理 // 取出所有关键字都存在的文章然后返回 func search(reverseFile ReverseIndex, keyWords ...string) []string { result := make([]string, 0) var finalDocumentNames map[string]bool for _, keyWord := range(keyWords) { documentNames := getDocumentNames(reverseFile, keyWord) if finalDocumentNames == nil { finalDocumentNames = documentNames } finalDocumentNames = makeSubSetOf(finalDocumentNames, documentNames) } for doc := range(finalDocumentNames) { result = append(result, doc) } return result } func main() { contents := make([]ToParsingFile, 0) files, err := ioutil.ReadDir("./docs") if err != nil { log.Panicf("read dir(./docs) failed, aborting...") } for _, file := range(files) { fileName := file.Name() content, err := readFile("./docs/" + fileName) if err != nil { log.Panicf("read file(%s) failed!", fileName) } contents = append(contents, ToParsingFile{fileName, content}) } blacklist := Blacklist{"a": false, "is": false} ignorelist := Ignorelist{ ' ': false, '\t': false, '\n': false, '\r': false, ',': false, '.': false, '"': false, ':': false, '`': false, '(': false, ')': false, '[': false, ']': false, '{': false, '}': false, '-': false, '_': false, '+': false, '=': false, '\\': false, '|': false, '\'': false, '<': false, '>': false, '$': false, } var reverseFiles []ReverseIndex for _, toParsingFile := range(contents) { reverseFiles = append(reverseFiles, parse(ignorelist, toParsingFile.FileName, toParsingFile.Content)) } reverseFile := mergeReverseIndex(blacklist, reverseFiles...) log.Printf("we're going to create reverse index") for k, vs := range reverseFile { log.Printf("Key: %s -> ", k) for _, v := range vs { log.Printf("in doc %s with offset %d, ", v.DocumentName, v.Offset) } } log.Printf("looping done") keyWords := os.Args[1:] searchResults := search(reverseFile, keyWords...) log.Printf("========================= Search Engine =================\n") log.Printf("searing %v...\n", keyWords) for _, searchResult := range(searchResults) { log.Printf("in file: %s\n", searchResult) } log.Printf("done.\n") }
执行日志:
$ go run reverse_file.go bigbang 2017/06/02 23:12:19 we're going to create reverse index 2017/06/02 23:12:19 Key: inherited -> 2017/06/02 23:12:19 in doc dev.rst with offset 297, 2017/06/02 23:12:19 Key: are -> 2017/06/02 23:12:19 in doc dev.rst with offset 16, 2017/06/02 23:12:19 Key: Installation -> 2017/06/02 23:12:19 in doc user.rst with offset 105, 2017/06/02 23:12:19 Key: about -> 2017/06/02 23:12:19 in doc user.rst with offset 78, 2017/06/02 23:12:19 Key: Using -> 2017/06/02 23:12:19 in doc user.rst with offset 266, 2017/06/02 23:12:19 Key: User -> 2017/06/02 23:12:19 in doc user.rst with offset 0, 2017/06/02 23:12:19 Key: of -> 2017/06/02 23:12:19 in doc dev.rst with offset 95, 2017/06/02 23:12:19 in doc user.rst with offset 36, 2017/06/02 23:12:19 in doc user.rst with offset 560, 2017/06/02 23:12:19 Key: method -> 2017/06/02 23:12:19 in doc dev.rst with offset 77, 2017/06/02 23:12:19 Key: by -> 2017/06/02 23:12:19 in doc user.rst with offset 337, 2017/06/02 23:12:19 Key: use -> 2017/06/02 23:12:19 in doc user.rst with offset 91, 2017/06/02 23:12:19 in doc user.rst with offset 309, 2017/06/02 23:12:19 Key: using -> 2017/06/02 23:12:19 in doc user.rst with offset 582, 2017/06/02 23:12:19 Key: set -> 2017/06/02 23:12:19 in doc user.rst with offset 468, 2017/06/02 23:12:19 in doc user.rst with offset 698, 2017/06/02 23:12:19 Key: for -> 2017/06/02 23:12:19 in doc dev.rst with offset 28, 2017/06/02 23:12:19 in doc dev.rst with offset 119, 2017/06/02 23:12:19 Key: loop -> 2017/06/02 23:12:19 in doc dev.rst with offset 231, 2017/06/02 23:12:19 in doc user.rst with offset 323, 2017/06/02 23:12:19 in doc user.rst with offset 381, 2017/06/02 23:12:19 in doc user.rst with offset 478, 2017/06/02 23:12:19 in doc user.rst with offset 567, 2017/06/02 23:12:19 in doc user.rst with offset 655, 2017/06/02 23:12:19 in doc user.rst with offset 679, 2017/06/02 23:12:19 in doc user.rst with offset 708, 2017/06/02 23:12:19 in doc user.rst with offset 713, 2017/06/02 23:12:19 Key: undoc -> 2017/06/02 23:12:19 in doc dev.rst with offset 279, 2017/06/02 23:12:19 Key: bigbang -> 2017/06/02 23:12:19 in doc dev.rst with offset 318, 2017/06/02 23:12:19 in doc user.rst with offset 720, 2017/06/02 23:12:19 Key: requires -> 2017/06/02 23:12:19 in doc user.rst with offset 168, 2017/06/02 23:12:19 Key: section -> 2017/06/02 23:12:19 in doc user.rst with offset 28, 2017/06/02 23:12:19 Key: templates -> 2017/06/02 23:12:19 in doc .gitignore with offset 16, 2017/06/02 23:12:19 Key: documentation -> 2017/06/02 23:12:19 in doc dev.rst with offset 102, 2017/06/02 23:12:19 in doc user.rst with offset 43, 2017/06/02 23:12:19 Key: or -> 2017/06/02 23:12:19 in doc dev.rst with offset 74, 2017/06/02 23:12:19 Key: event -> 2017/06/02 23:12:19 in doc dev.rst with offset 225, 2017/06/02 23:12:19 in doc user.rst with offset 317, 2017/06/02 23:12:19 in doc user.rst with offset 375, 2017/06/02 23:12:19 in doc user.rst with offset 472, 2017/06/02 23:12:19 in doc user.rst with offset 673, 2017/06/02 23:12:19 in doc user.rst with offset 702, 2017/06/02 23:12:19 Key: build -> 2017/06/02 23:12:19 in doc .gitignore with offset 1, 2017/06/02 23:12:19 Key: this -> 2017/06/02 23:12:19 in doc dev.rst with offset 85, 2017/06/02 23:12:19 Key: members -> 2017/06/02 23:12:19 in doc dev.rst with offset 186, 2017/06/02 23:12:19 in doc dev.rst with offset 267, 2017/06/02 23:12:19 in doc dev.rst with offset 285, 2017/06/02 23:12:19 in doc dev.rst with offset 307, 2017/06/02 23:12:19 Key: autofunction -> 2017/06/02 23:12:19 in doc dev.rst with offset 199, 2017/06/02 23:12:19 Key: Alternatively -> 2017/06/02 23:12:19 in doc user.rst with offset 518, 2017/06/02 23:12:19 Key: Use -> 2017/06/02 23:12:19 in doc user.rst with offset 190, 2017/06/02 23:12:19 Key: instance -> 2017/06/02 23:12:19 in doc user.rst with offset 551, 2017/06/02 23:12:19 Key: API -> 2017/06/02 23:12:19 in doc dev.rst with offset 0, 2017/06/02 23:12:19 Key: the -> 2017/06/02 23:12:19 in doc dev.rst with offset 98, 2017/06/02 23:12:19 in doc user.rst with offset 39, 2017/06/02 23:12:19 in doc user.rst with offset 313, 2017/06/02 23:12:19 in doc user.rst with offset 362, 2017/06/02 23:12:19 in doc user.rst with offset 563, 2017/06/02 23:12:19 Key: console -> 2017/06/02 23:12:19 in doc user.rst with offset 230, 2017/06/02 23:12:19 Key: autoclass -> 2017/06/02 23:12:19 in doc dev.rst with offset 148, 2017/06/02 23:12:19 in doc dev.rst with offset 240, 2017/06/02 23:12:19 Key: looking -> 2017/06/02 23:12:19 in doc dev.rst with offset 20, 2017/06/02 23:12:19 Key: provided -> 2017/06/02 23:12:19 in doc user.rst with offset 328, 2017/06/02 23:12:19 Key: If -> 2017/06/02 23:12:19 in doc dev.rst with offset 9, 2017/06/02 23:12:19 Key: 3 -> 2017/06/02 23:12:19 in doc user.rst with offset 184, 2017/06/02 23:12:19 Key: create -> 2017/06/02 23:12:19 in doc user.rst with offset 541, 2017/06/02 23:12:19 Key: manually -> 2017/06/02 23:12:19 in doc user.rst with offset 572, 2017/06/02 23:12:19 Key: an -> 2017/06/02 23:12:19 in doc user.rst with offset 548, 2017/06/02 23:12:19 Key: can -> 2017/06/02 23:12:19 in doc user.rst with offset 537, 2017/06/02 23:12:19 Key: function -> 2017/06/02 23:12:19 in doc dev.rst with offset 58, 2017/06/02 23:12:19 Key: new -> 2017/06/02 23:12:19 in doc dev.rst with offset 221, 2017/06/02 23:12:19 in doc user.rst with offset 669, 2017/06/02 23:12:19 Key: policy -> 2017/06/02 23:12:19 in doc user.rst with offset 386, 2017/06/02 23:12:19 in doc user.rst with offset 483, 2017/06/02 23:12:19 Key: It -> 2017/06/02 23:12:19 in doc user.rst with offset 165, 2017/06/02 23:12:19 Key: from -> 2017/06/02 23:12:19 in doc user.rst with offset 154, 2017/06/02 23:12:19 Key: available -> 2017/06/02 23:12:19 in doc user.rst with offset 144, 2017/06/02 23:12:19 Key: Guide -> 2017/06/02 23:12:19 in doc user.rst with offset 5, 2017/06/02 23:12:19 Key: uvloop -> 2017/06/02 23:12:19 in doc dev.rst with offset 130, 2017/06/02 23:12:19 in doc dev.rst with offset 160, 2017/06/02 23:12:19 in doc dev.rst with offset 214, 2017/06/02 23:12:19 in doc dev.rst with offset 252, 2017/06/02 23:12:19 in doc user.rst with offset 95, 2017/06/02 23:12:19 in doc user.rst with offset 133, 2017/06/02 23:12:19 in doc user.rst with offset 257, 2017/06/02 23:12:19 in doc user.rst with offset 272, 2017/06/02 23:12:19 in doc user.rst with offset 341, 2017/06/02 23:12:19 in doc user.rst with offset 367, 2017/06/02 23:12:19 in doc user.rst with offset 449, 2017/06/02 23:12:19 in doc user.rst with offset 490, 2017/06/02 23:12:19 in doc user.rst with offset 644, 2017/06/02 23:12:19 in doc user.rst with offset 662, 2017/06/02 23:12:19 Key: python -> 2017/06/02 23:12:19 in doc user.rst with offset 411, 2017/06/02 23:12:19 in doc user.rst with offset 606, 2017/06/02 23:12:19 Key: class -> 2017/06/02 23:12:19 in doc dev.rst with offset 68, 2017/06/02 23:12:19 Key: how -> 2017/06/02 23:12:19 in doc user.rst with offset 84, 2017/06/02 23:12:19 Key: it -> 2017/06/02 23:12:19 in doc user.rst with offset 209, 2017/06/02 23:12:19 Key: provides -> 2017/06/02 23:12:19 in doc user.rst with offset 57, 2017/06/02 23:12:19 Key: To -> 2017/06/02 23:12:19 in doc user.rst with offset 293, 2017/06/02 23:12:19 Key: 5 -> 2017/06/02 23:12:19 in doc user.rst with offset 186, 2017/06/02 23:12:19 Key: part -> 2017/06/02 23:12:19 in doc dev.rst with offset 90, 2017/06/02 23:12:19 Key: pip -> 2017/06/02 23:12:19 in doc user.rst with offset 194, 2017/06/02 23:12:19 in doc user.rst with offset 245, 2017/06/02 23:12:19 Key: asyncio -> 2017/06/02 23:12:19 in doc user.rst with offset 301, 2017/06/02 23:12:19 in doc user.rst with offset 430, 2017/06/02 23:12:19 in doc user.rst with offset 460, 2017/06/02 23:12:19 in doc user.rst with offset 625, 2017/06/02 23:12:19 in doc user.rst with offset 690, 2017/06/02 23:12:19 Key: PyPI -> 2017/06/02 23:12:19 in doc user.rst with offset 159, 2017/06/02 23:12:19 Key: static -> 2017/06/02 23:12:19 in doc .gitignore with offset 8, 2017/06/02 23:12:19 Key: specific -> 2017/06/02 23:12:19 in doc dev.rst with offset 49, 2017/06/02 23:12:19 Key: make -> 2017/06/02 23:12:19 in doc user.rst with offset 296, 2017/06/02 23:12:19 Key: Python -> 2017/06/02 23:12:19 in doc user.rst with offset 177, 2017/06/02 23:12:19 Key: code -> 2017/06/02 23:12:19 in doc user.rst with offset 217, 2017/06/02 23:12:19 in doc user.rst with offset 398, 2017/06/02 23:12:19 in doc user.rst with offset 593, 2017/06/02 23:12:19 Key: block -> 2017/06/02 23:12:19 in doc user.rst with offset 222, 2017/06/02 23:12:19 in doc user.rst with offset 403, 2017/06/02 23:12:19 in doc user.rst with offset 598, 2017/06/02 23:12:19 Key: you -> 2017/06/02 23:12:19 in doc dev.rst with offset 12, 2017/06/02 23:12:19 in doc dev.rst with offset 123, 2017/06/02 23:12:19 in doc user.rst with offset 350, 2017/06/02 23:12:19 in doc user.rst with offset 533, 2017/06/02 23:12:19 Key: EventLoopPolicy -> 2017/06/02 23:12:19 in doc dev.rst with offset 167, 2017/06/02 23:12:19 in doc user.rst with offset 497, 2017/06/02 23:12:19 Key: to -> 2017/06/02 23:12:19 in doc user.rst with offset 88, 2017/06/02 23:12:19 in doc user.rst with offset 198, 2017/06/02 23:12:19 Key: This -> 2017/06/02 23:12:19 in doc user.rst with offset 23, 2017/06/02 23:12:19 Key: on -> 2017/06/02 23:12:19 in doc dev.rst with offset 44, 2017/06/02 23:12:19 Key: import -> 2017/06/02 23:12:19 in doc user.rst with offset 423, 2017/06/02 23:12:19 in doc user.rst with offset 442, 2017/06/02 23:12:19 in doc user.rst with offset 618, 2017/06/02 23:12:19 in doc user.rst with offset 637, 2017/06/02 23:12:19 Key: install -> 2017/06/02 23:12:19 in doc user.rst with offset 201, 2017/06/02 23:12:19 in doc user.rst with offset 249, 2017/06/02 23:12:19 in doc user.rst with offset 354, 2017/06/02 23:12:19 Key: Loop -> 2017/06/02 23:12:19 in doc dev.rst with offset 259, 2017/06/02 23:12:19 Key: information -> 2017/06/02 23:12:19 in doc dev.rst with offset 32, 2017/06/02 23:12:19 in doc user.rst with offset 66, 2017/06/02 23:12:19 looping done 2017/06/02 23:12:19 ========================= Search Engine ================= 2017/06/02 23:12:19 searing [bigbang]... 2017/06/02 23:12:19 in file: dev.rst 2017/06/02 23:12:19 in file: user.rst 2017/06/02 23:12:19 done.
TODO:
- 使用B树替代哈希表做倒排索引
- 使用文件持久化倒排索引
以上所述就是小编给大家介绍的《自己写个搜索引擎》,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对 码农网 的支持!
猜你喜欢:本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们。
The Dream Machine
M. Mitchell Waldrop / Penguin Books / 2002-8 / USD 16.00
While most people may not be familiar with the name J. C. R. Licklider, he was the guiding spirit behind the greatest revolution of the modern era. At a time when most computers were big, ponderous ma......一起来看看 《The Dream Machine》 这本书的介绍吧!