2022-08-26 15:10:51 +03:00
|
|
|
|
import PageData from '../models/page.js';
|
|
|
|
|
import Pages from '../controllers/pages.js';
|
|
|
|
|
import urlify from '../utils/urlify.js';
|
2022-09-07 18:40:28 +03:00
|
|
|
|
import Page from '../models/page.js';
|
|
|
|
|
|
2022-09-09 19:12:27 +03:00
|
|
|
|
/**
 * In-memory words index: maps a word to the ids of the pages that contain it,
 * and for every page id stores the accumulated weight of that word on the page.
 * Created with Object.create(null), so keys inherited from Object.prototype never show up as words.
 */
let globalWords: { [key: string]: {[key: string]: number} } = Object.create(null);
|
2022-09-07 18:40:28 +03:00
|
|
|
|
/**
 * Pages loaded by the last Search.syncDB() call.
 * Search.query() looks pages up in this list by their id.
 */
let globalPages: PageData[] = [];
|
2022-08-11 18:28:15 +03:00
|
|
|
|
|
|
|
|
|
class Search {
  /**
   * How many of the best-ranked pages are inspected block by block in query()
   */
  private static readonly MAX_PAGES = 10;

  /**
   * How many block-level results query() returns at most
   */
  private static readonly MAX_RESULTS = 15;

  /**
   * Initialize search
   *
   * Builds the words index on first use. Does nothing when the index already contains words.
   */
  public async init(): Promise<void> {
    if (globalWords && Object.keys(globalWords).length) {
      return;
    }

    await this.syncDB();
  }

  /**
   * Load all pages from DB and update globalWords
   * Use this method when any page was updated
   *
   * The new index is built aside and published only after the pages were loaded,
   * so a failed load never leaves an empty index next to a stale list of pages.
   */
  public async syncDB(): Promise<void> {
    const pages = await this.getPages();
    const words = this.buildWordsIndex(pages);

    globalPages = pages;
    globalWords = words;

    console.log('Done');
  }

  /**
   * Search for pages by given query
   *
   * @param searchString - raw text entered by the user
   * @returns hard-coded suggestions and up to MAX_RESULTS matching blocks, best ratio first.
   *          Every result is the page data extended with `shortBody` (block text with highlighted words),
   *          `section` (text of the closest preceding header), `anchor` (urlified section) and `ratio`.
   */
  public async query(searchString: string) {
    await this.init();

    const searchWords = this.splitTextToWords(searchString);

    const goodPages = (await this.getPagesByWords(searchWords))
      .slice(0, Search.MAX_PAGES);

    const returnPages: {[key: string]: string|number, ratio: number}[] = [];

    for (const { pageId, ratio } of goodPages) {
      const page = globalPages.find(candidate => candidate._id === pageId);

      if (!page) {
        continue;
      }

      for (const match of this.findMatchingBlocks(this.getPageBlocks(page), searchWords)) {
        returnPages.push({
          ...page,
          shortBody: match.shortBody,
          anchor: urlify(match.section),
          section: match.section,
          ratio: ratio * match.boost,
        });
      }
    }

    return {
      suggestions: ['description', 'about', 'contact'],
      pages: returnPages
        .sort((a, b) => b.ratio - a.ratio)
        .slice(0, Search.MAX_RESULTS)
    }
  }

  /**
   * Load all pages through the Pages controller
   *
   * @private
   */
  private async getPages(): Promise<Page[]> {
    return Pages.getAll();
  }

  /**
   * Return the list of content blocks of a page, or an empty list when the page has no body/blocks
   *
   * @param page - page-like object with an optional `body.blocks` array
   * @private
   */
  private getPageBlocks(page: { body?: any }): any[] {
    return page.body && Array.isArray(page.body.blocks) ? page.body.blocks : [];
  }

  /**
   * Build the words index for the given pages
   *
   * For every word of every block the weight of the block (see getBlockRatio)
   * is added to the entry of the page the block belongs to.
   * Pages without an id are skipped: nothing could reference them from the index.
   *
   * @param pages - pages to be indexed
   * @returns map "word -> page id -> accumulated weight" (prototype-less objects)
   * @private
   */
  private buildWordsIndex(pages: Page[]): { [key: string]: {[key: string]: number} } {
    const index: { [key: string]: {[key: string]: number} } = Object.create(null);

    for (const page of pages) {
      const pageId = page._id;

      if (!pageId) {
        continue;
      }

      for (const block of this.getPageBlocks(page)) {
        const blockRatio = this.getBlockRatio(block);
        const blockWords = this.splitTextToWords(this.getCleanTextFromBlock(block));

        for (const word of blockWords) {
          if (!index[word]) {
            index[word] = Object.create(null);
          }

          index[word][pageId] = (index[word][pageId] || 0) + blockRatio;
        }
      }
    }

    return index;
  }

  /**
   * Find the blocks that contain at least one of the search words
   *
   * @param blocks - content blocks of a single page, in document order
   * @param searchWords - lowercased words of the search query
   * @returns for every matching block: its clean text with highlighted words (`shortBody`),
   *          the text of the closest preceding header (`section`)
   *          and a multiplier (`boost`) that grows tenfold with every search word found in the block
   * @private
   */
  private findMatchingBlocks(blocks: any[], searchWords: string[]): { shortBody: string, section: string, boost: number }[] {
    const matches: { shortBody: string, section: string, boost: number }[] = [];
    let section = '';

    for (const block of blocks) {
      if (!block) {
        continue;
      }

      const blockContent = this.getCleanTextFromBlock(block);

      /**
       * Headers always open a new section, even when they do not match the query themselves
       */
      if (block.type === 'header') {
        section = blockContent;
      }

      const loweredContent = blockContent.toLowerCase();
      let boost = 1;

      for (const word of searchWords) {
        if (loweredContent.includes(word)) {
          boost *= 10;
        }
      }

      /**
       * Boost stays equal to 1 when no search word was found: such a block is not a search result
       */
      if (boost === 1) {
        continue;
      }

      matches.push({
        shortBody: this.highlightSubstring(blockContent, searchWords),
        section,
        boost,
      });
    }

    return matches;
  }

  /**
   * Return list of pages with a given words
   *
   * @param words - lowercased words of the search query
   * @returns page ids with summed weights, the biggest weight first
   * @private
   */
  private async getPagesByWords(words: string[]) {
    /**
     * Prototype-less accumulator: any page id is a safe key
     */
    const pagesList: {[key: string]: number} = Object.create(null);

    /**
     * Get list of indexed words containing any word from the search query as a substring
     */
    const validWords = Object.keys(globalWords)
      .filter(word => words.some(searchWord => word.includes(searchWord)));

    /**
     * For each word get list of pages with this word
     */
    for (const word of validWords) {
      for (const pageId of Object.keys(globalWords[word])) {
        pagesList[pageId] = (pagesList[pageId] || 0) + globalWords[word][pageId];
      }
    }

    /**
     * Sort pages by frequency of given words
     */
    return Object.keys(pagesList)
      .map(pageId => ({
        pageId,
        ratio: pagesList[pageId]
      }))
      .sort((a, b) => b.ratio - a.ratio);
  }

  /**
   * Get block's ratio. It is used to calculate the weight of the words in the block
   *
   * @param block - content block with `type` and `data` fields
   * @returns 16 for a first-level header, 2 for other headers, 1.1 for a paragraph,
   *          1 for a list and 0 for anything else
   * @private
   */
  private getBlockRatio(block: any): number {
    if (!block) {
      return 0;
    }

    switch (block.type) {
      case 'header':
        return block.data && block.data.level === 1 ? 16 : 2;

      case 'paragraph':
        return 1.1;

      case 'list':
        return 1;

      default:
        return 0;
    }
  }

  /**
   * Return clear text content from block without HTML tags and special characters
   *
   * @param block - content block with `type` and `data` fields
   * @returns clean text of a header, paragraph or list; empty string for other block types
   *          and for blocks without textual data
   * @private
   */
  private getCleanTextFromBlock(block: any): string {
    if (!block || !block.data) {
      return '';
    }

    let rawContent: string;

    switch (block.type) {
      case 'header':
      case 'paragraph':
        rawContent = typeof block.data.text === 'string' ? block.data.text : '';
        break;

      case 'list':
        /**
         * Only plain string items are used: joining objects would index "[object Object]"
         */
        rawContent = Array.isArray(block.data.items)
          ? block.data.items.filter((item: unknown) => typeof item === 'string').join(' ')
          : '';
        break;

      default:
        return '';
    }

    return this.removeHTMLSpecialCharacters(this.removeHTMLTags(rawContent));
  }

  /**
   * Remove HTML tags from string. Only content inside tags will be left
   *
   * @param text - text that may contain HTML markup
   * @private
   */
  private removeHTMLTags(text: string): string {
    /**
     * The closing bracket is optional, so an unterminated tag is removed up to the end of the text as well
     */
    return text.replace(/<[^>]*>?/gm, '');
  }

  /**
   * Remove special characters from text. For example: &amp; &quot; &lt; &gt;
   *
   * Only well-formed entities (named, decimal or hexadecimal) are removed.
   * A bare ampersand stays in place together with the text that follows it.
   *
   * @param text - text that may contain HTML entities
   * @private
   */
  private removeHTMLSpecialCharacters(text: string): string {
    return text.replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, '');
  }

  /**
   * Split text to words
   *
   * @param text - any text
   * @returns lowercased words of 3 or more characters, in the order of appearance
   * @private
   */
  private splitTextToWords(text: string): string[] {
    return text
      // lowercase all words
      .toLowerCase()

      // remove punctuation
      .replace(/[.,;:]/gi, '')

      // left only letters (+cyrillic, "ё" is outside of the а-я range) and numbers
      .replace(/[^a-zа-яё0-9]/gi, ' ')

      // remove multiple spaces
      .replace(/\s+/g, ' ')

      // remove spaces at the beginning and at the end
      .trim()

      // split to words by spaces
      .split(' ')

      // ignore words shorter than 3 chars
      .filter(word => word.length >= 3);
  }

  /**
   * Highlight substring in string with a span wrapper
   *
   * @param text - text to search in
   * @param words - word or list of words to be highlighted (matched case-insensitively as plain text)
   * @returns text where every occurrence of a word is wrapped with <span class="search-word">;
   *          the text itself when there is nothing to highlight
   */
  private highlightSubstring(text: string, words: string|string[]): string {
    const wordsList = typeof words === 'string' ? [words] : words;

    const patterns = wordsList
      // an empty alternative would match between every two characters
      .filter(word => word.length > 0)

      // words are plain text, not regular expressions
      .map(word => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))

      // longer words go first, otherwise "foo" would win over "foobar"
      .sort((a, b) => b.length - a.length);

    if (patterns.length === 0) {
      return text;
    }

    const wordRegExp = new RegExp(patterns.join('|'), "ig");
    const CLASS_STYLE = 'search-word';

    return text.replace(wordRegExp, `<span class="${CLASS_STYLE}">$&</span>`);
  }
}
|
|
|
|
|
|
2022-09-09 19:12:27 +03:00
|
|
|
|
/**
|
|
|
|
|
* Export initialized instance
|
|
|
|
|
*/
|
|
|
|
|
// One shared instance for all importers. Nothing is indexed at construction time:
// the words index is built by init(), which query() calls before searching.
export default new Search();
|