import PageData from '../models/page.js';
import Pages from '../controllers/pages.js';
import urlify from '../utils/urlify.js';
import Page from '../models/page.js';

/**
 * Inverted index: word -> (page id -> accumulated weight of that word on the page)
 */
type WordsIndex = { [key: string]: { [key: string]: number } };

/**
 * Content block as read from `page.body.blocks`.
 * Only the fields this module actually reads are declared.
 */
interface ContentBlock {
  type: string;
  data?: {
    text?: unknown;
    level?: unknown;
    items?: unknown;
  };
}

/**
 * Module-level search state shared by every user of the exported instance.
 * Both objects are created without a prototype / replaced as a whole by syncDB().
 */
let globalWords: WordsIndex = Object.create(null);
let globalPages: PageData[] = [];

/**
 * In-memory full-text search over pages.
 *
 * Pages are loaded through the Pages controller, every supported content block
 * is split into words, and each word is stored in an index together with
 * a per-page weight. Queries are answered from that index.
 */
class Search {
  /**
   * Initialize search.
   * Builds the index on first use; does nothing when the index already contains words.
   * Note: while the index contains no words at all, every call rebuilds it.
   */
  public async init(): Promise<void> {
    if (globalWords && Object.keys(globalWords).length) {
      return;
    }

    await this.syncDB();
  }

  /**
   * Load all pages from DB and rebuild the words index.
   * Use this method when any page was updated.
   *
   * The new index is built in local variables and published only when complete,
   * so a query running while the pages are being loaded keeps using the previous
   * index instead of an empty one.
   */
  public async syncDB(): Promise<void> {
    const pages = await this.getPages();
    const words: WordsIndex = Object.create(null);

    for (const page of pages) {
      for (const block of this.getPageBlocks(page)) {
        const blockRatio = this.getBlockRatio(block);
        const blockWords = this.splitTextToWords(this.getCleanTextFromBlock(block));

        for (const word of blockWords) {
          if (!words[word]) {
            words[word] = Object.create(null);
          }

          /**
           * Pages without an id can not be referenced from the index
           */
          if (page._id) {
            words[word][page._id] = (words[word][page._id] || 0) + blockRatio;
          }
        }
      }
    }

    globalWords = words;
    globalPages = pages;

    console.log('Done');
  }

  /**
   * Search for pages by given query
   *
   * @param searchString - raw text entered by the user
   * @returns static suggestions and up to 15 block-level results sorted by ratio (descending);
   *          each result is the page data extended with `shortBody` (highlighted block text),
   *          `section` (text of the closest preceding header), `anchor` and `ratio`
   */
  public async query(searchString: string) {
    await this.init();

    const searchWords = this.splitTextToWords(searchString);
    const goodPages = (await this.getPagesByWords(searchWords)).slice(0, 10);
    const returnPages: {[key: string]: string|number, ratio: number}[] = [];

    /**
     * Lookup table for pages; on duplicated ids the last page wins
     */
    const pagesById = new Map<string, PageData>();

    for (const page of globalPages) {
      if (page._id) {
        pagesById.set(page._id, page);
      }
    }

    for (const { pageId, ratio } of goodPages) {
      const page = pagesById.get(pageId);

      if (!page) {
        continue;
      }

      /**
       * Text of the last header seen so far on this page
       */
      let section = '';

      for (const block of this.getPageBlocks(page)) {
        const blockContent = this.getCleanTextFromBlock(block);

        if (block.type === 'header') {
          section = blockContent;
        }

        /**
         * Each search word found in the block boosts the block's ratio tenfold
         */
        const loweredContent = blockContent.toLowerCase();
        let koef = 1;

        for (const word of searchWords) {
          if (loweredContent.includes(word)) {
            koef *= 10;
          }
        }

        const shortBody = this.highlightSubstring(blockContent, searchWords);

        /**
         * NOTE(review): this push used to be guarded by `koef > 0`, which is always true
         * (koef starts at 1 and is only multiplied). Blocks without any matched word are
         * therefore still returned with the page's base ratio — confirm whether they
         * should be filtered out instead (`koef > 1`).
         */
        returnPages.push({
          ...page,
          shortBody,
          anchor: urlify(section),
          section,
          ratio: ratio * koef,
        });
      }
    }

    return {
      suggestions: ['description', 'about', 'contact'],
      pages: returnPages
        .sort((a, b) => b.ratio - a.ratio)
        .slice(0, 15),
    };
  }

  /**
   * Load all pages through the Pages controller
   *
   * @private
   */
  private async getPages(): Promise<PageData[]> {
    return await Pages.getAll();
  }

  /**
   * Return content blocks of a page, or an empty list when the page has no block list
   *
   * @param page - page to read blocks from
   * @private
   */
  private getPageBlocks(page: PageData): ContentBlock[] {
    const body = page.body;

    return body && Array.isArray(body.blocks) ? body.blocks : [];
  }

  /**
   * Return list of pages with a given words
   *
   * @param words - normalized words of the search query
   * @returns `{ pageId, ratio }` pairs sorted by ratio (descending)
   * @private
   */
  private async getPagesByWords(words: string[]) {
    /**
     * Prototype-less accumulator, so any page id is a safe key
     */
    const pagesList: {[key: string]: number} = Object.create(null);

    for (const indexedWord of Object.keys(globalWords)) {
      /**
       * Take only indexed words containing at least one word from the search query
       */
      if (!words.some(searchWord => indexedWord.includes(searchWord))) {
        continue;
      }

      /**
       * Sum up weights of this word for every page it occurs on
       */
      const pagesWithWord = globalWords[indexedWord];

      for (const pageId of Object.keys(pagesWithWord)) {
        pagesList[pageId] = (pagesList[pageId] || 0) + pagesWithWord[pageId];
      }
    }

    /**
     * Sort pages by accumulated weight of given words
     */
    return Object.keys(pagesList)
      .map(pageId => ({
        pageId,
        ratio: pagesList[pageId],
      }))
      .sort((a, b) => b.ratio - a.ratio);
  }

  /**
   * Get block's ratio. It is used to calculate the weight of the words in the block
   *
   * @param block - content block
   * @returns 16 for a level 1 header, 2 for other headers, 1.1 for a paragraph,
   *          1 for a list and 0 for any other block type
   * @private
   */
  private getBlockRatio(block: ContentBlock): number {
    switch (block.type) {
      case 'header':
        return block.data && block.data.level === 1 ? 16 : 2;

      case 'paragraph':
        return 1.1;

      case 'list':
        return 1;

      default:
        return 0;
    }
  }

  /**
   * Return clear text content from block without HTML tags and HTML entities.
   * Blocks of unsupported types and blocks with missing data give an empty string.
   *
   * @param block - content block
   * @private
   */
  private getCleanTextFromBlock(block: ContentBlock): string {
    const data = block.data;
    let blockContent: string;

    switch (block.type) {
      case 'header':
      case 'paragraph':
        blockContent = data && typeof data.text === 'string' ? data.text : '';
        break;

      case 'list':
        /**
         * Only string items are indexed; anything else has no text to search in
         */
        blockContent = data && Array.isArray(data.items)
          ? data.items.filter((item): item is string => typeof item === 'string').join(' ')
          : '';
        break;

      default:
        return '';
    }

    return this.removeHTMLSpecialCharacters(this.removeHTMLTags(blockContent));
  }

  /**
   * Remove HTML tags from string. Only content inside tags will be left
   *
   * @param text - text that may contain HTML markup
   * @private
   */
  private removeHTMLTags(text: string): string {
    return text.replace(/<[^>]*>?/gm, '');
  }

  /**
   * Remove HTML entities from text.
   * For example: &nbsp; &amp; &quot; &lt; &gt; &#39; &#x27;
   *
   * Only well-formed entities (terminated by a semicolon) are removed,
   * so a bare ampersand does not swallow the text that follows it.
   *
   * @param text - text that may contain HTML entities
   * @private
   */
  private removeHTMLSpecialCharacters(text: string): string {
    return text.replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, '');
  }

  /**
   * Split text to words
   *
   * @param text - text to split
   * @returns lowercased words of 3 or more characters
   * @private
   */
  private splitTextToWords(text: string): string[] {
    return text
      // lowercase all words
      .toLowerCase()

      // remove punctuation
      .replace(/[.,;:]/gi, '')

      // left only letters (+cyrillic, including "ё" which is outside of the а-я range) and numbers
      .replace(/[^a-zа-яё0-9]/gi, ' ')

      // remove multiple spaces
      .replace(/\s+/g, ' ')

      // remove spaces at the beginning and at the end
      .trim()

      // split to words by spaces
      .split(' ')

      // ignore words shorter than 3 chars
      .filter(word => word.length >= 3);
  }

  /**
   * Highlight substring in string with a span wrapper
   *
   * @param text - text to highlight words in
   * @param words - word or list of words to highlight (matched case-insensitively)
   * @returns text with every match wrapped in `<span class="search-word">`;
   *          the text itself when there is nothing to highlight
   * @private
   */
  private highlightSubstring(text: string, words: string|string[]): string {
    const CLASS_STYLE = 'search-word';

    /**
     * Words are matched literally: empty words are dropped and
     * characters with a special meaning in regular expressions are escaped
     */
    const patterns = (typeof words === 'string' ? [words] : words)
      .filter(word => word.length > 0)
      .map(word => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));

    /**
     * An empty pattern would match at every position of the text
     */
    if (patterns.length === 0) {
      return text;
    }

    const wordRegExp = new RegExp(patterns.join('|'), 'ig');

    return text.replace(wordRegExp, `<span class="${CLASS_STYLE}">$&</span>`);
  }
}

/**
 * Export initialized instance
 */
export default new Search();