1
0
Fork 0
mirror of https://github.com/codex-team/codex.docs.git synced 2025-07-20 21:59:41 +02:00
codex.docs/src/backend/controllers/search.ts

264 lines
6 KiB
TypeScript
Raw Normal View History

2022-08-26 15:10:51 +03:00
import PageData from '../models/page.js';
import Pages from '../controllers/pages.js';
import urlify from '../utils/urlify.js';
2022-08-11 18:28:15 +03:00
/**
 * Full-text search over the documentation pages.
 *
 * Builds an in-memory inverted index (word -> page id -> weight) from the text
 * of header, paragraph and list blocks and uses it to rank pages for a query.
 */
class Search {
  /**
   * Inverted index: word -> (page id -> accumulated weight of blocks containing the word).
   * Created without a prototype so that words like "constructor" are ordinary keys.
   */
  private words: { [key: string]: {[key: string]: number} } = Object.create(null);

  /**
   * Pages loaded by the latest init() call
   */
  private pages: PageData[] = [];

  /**
   * Load all pages and rebuild the word index from scratch.
   *
   * The index is built in a local object and swapped in at the end, so calling
   * init() repeatedly (query() does it on every request) never accumulates
   * weights from previous runs and never keeps ids of pages that no longer exist.
   */
  public async init() {
    this.pages = await this.getPages();

    const index: { [key: string]: {[key: string]: number} } = Object.create(null);

    for (const page of this.pages) {
      /**
       * A page without an id cannot be referenced from the index
       */
      if (!page._id) {
        continue;
      }

      const blocks = page.body && Array.isArray(page.body.blocks) ? page.body.blocks : [];

      /**
       * Read content blocks from page
       */
      for (const block of blocks) {
        const blockRatio = this.getBlockRatio(block);
        const blockContent = this.getCleanTextFromBlock(block);
        const blockWords: string[] = this.splitTextToWords(blockContent);

        /**
         * Process list of words in a block
         */
        for (const word of blockWords) {
          if (!index[word]) {
            index[word] = Object.create(null);
          }

          /**
           * Add the block weight to the page score for this word
           */
          index[word][page._id] = (index[word][page._id] || 0) + blockRatio;
        }
      }
    }

    this.words = index;
  }

  /**
   * Find blocks matching the search string.
   *
   * @param searchString - raw user query; words are separated by whitespace
   * @returns object with an (always empty) `suggestions` list and up to 15 `pages` entries
   *          sorted by descending ratio. Each entry holds the page fields plus `shortBody`
   *          (block text with matched words wrapped in a span), `section` (text of the latest
   *          header block seen on the page), `anchor` (urlify(section)) and `ratio`.
   * @throws rethrows any error raised while loading pages
   */
  public async query(searchString: string) {
    try {
      await this.init();
    } catch (error) {
      console.log(error);
      throw error;
    }

    const searchWords = searchString
      .trim()
      .toLowerCase()
      .split(/\s+/)
      // an empty query would otherwise produce a single empty word that matches everything
      .filter(word => word.length > 0);

    const returnPages: {[key: string]: string|number, ratio: number}[] = [];

    if (searchWords.length === 0) {
      return {
        suggestions: [],
        pages: returnPages,
      };
    }

    /**
     * Only the 10 best pages are inspected block by block
     */
    const goodPages = (await this.getPagesByWords(searchWords))
      .slice(0, 10);

    goodPages.forEach(({ pageId, ratio }) => {
      const page = this.pages.find(candidate => candidate._id === pageId);

      if (!page) {
        return;
      }

      /**
       * Text of the latest header block, used to build the anchor of a found block
       */
      let section = '';

      page.body.blocks.forEach((block: any) => {
        const blockContent = this.getCleanTextFromBlock(block);

        if (block.type === 'header') {
          section = blockContent;
        }

        const loweredContent = blockContent.toLowerCase();

        /**
         * Number of query words that occur in the block as substrings
         */
        const koef = searchWords.filter(word => loweredContent.includes(word)).length;

        if (koef > 0) {
          returnPages.push({
            ...page,
            shortBody: this.highlightSubstring(blockContent, searchWords),
            anchor: urlify(section),
            section,
            ratio: ratio * koef,
          });
        }
      });
    });

    return {
      suggestions: [],
      pages: returnPages
        .sort((a, b) => b.ratio - a.ratio)
        .slice(0, 15),
    };
  }

  /**
   * Load all pages via the Pages controller
   */
  private async getPages() {
    return await Pages.getAll();
  }

  /**
   * Collect pages containing at least one of the passed words (exact index match).
   *
   * @param words - query words
   * @returns list of `{ pageId, ratio }` sorted by descending ratio, where ratio is
   *          the sum of the index weights of all matched words for the page
   */
  private async getPagesByWords(words: string[]) {
    const wanted = new Set(words);
    const pagesList: {[key: string]: number} = Object.create(null);

    Object.keys(this.words)
      .filter(word => wanted.has(word))
      .forEach(word => {
        const pagesWithWord = this.words[word];

        Object.keys(pagesWithWord).forEach(pageId => {
          pagesList[pageId] = (pagesList[pageId] || 0) + pagesWithWord[pageId];
        });
      });

    return Object.keys(pagesList)
      .map(pageId => {
        return {
          pageId,
          ratio: pagesList[pageId],
        };
      })
      .sort((a, b) => b.ratio - a.ratio);
  }

  /**
   * Return sorted list of unique strings
   */
  private getUnique(elements: string[]) {
    return [...new Set(elements)].sort();
  }

  /**
   * Weight of a block in the index depending on its type:
   * header - 6, paragraph - 2, list - 1, any other type - 0
   */
  private getBlockRatio(block: any) {
    switch (block.type) {
      case 'header':
        return 6;

      case 'paragraph':
        return 2;

      case 'list':
        return 1;

      default:
        return 0;
    }
  }

  /**
   * Extract plain text from a header, paragraph or list block.
   * HTML tags and HTML entities are removed; other block types give an empty string.
   * Missing or malformed block data is treated as empty text instead of throwing.
   */
  private getCleanTextFromBlock(block: any): string {
    const data = block.data || {};
    let blockContent = '';

    switch (block.type) {
      case 'header':
      case 'paragraph':
        blockContent = typeof data.text === 'string' ? data.text : '';
        break;

      case 'list':
        blockContent = Array.isArray(data.items) ? data.items.join(' ') : '';
        break;

      default:
        return blockContent;
    }

    blockContent = this.removeHTMLTags(blockContent);
    blockContent = this.removeHTMLSpecialCharacters(blockContent);

    return blockContent;
  }

  /**
   * Remove everything that looks like an HTML tag, including an unterminated one
   */
  private removeHTMLTags(text: string) {
    return text.replace(/<[^>]*>?/gm, '');
  }

  /**
   * Remove HTML entities such as `&nbsp;`, `&#39;` or `&#x27;`.
   * Only well-formed entities are removed, so a bare ampersand ("Tom & Jerry")
   * does not swallow the text that follows it.
   */
  private removeHTMLSpecialCharacters(text: string) {
    return text.replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, '');
  }

  /**
   * Split text to a list of lowercased words suitable for indexing
   */
  private splitTextToWords(text: string): string[] {
    return text
      // lowercase all words
      .toLowerCase()
      // remove punctuation
      .replace(/[.,;:]/g, '')
      // leave only letters (latin and cyrillic, "ё" is outside of the а-я range) and numbers
      .replace(/[^a-zа-яё0-9]/gi, ' ')
      // split to words by spaces
      .split(/\s+/)
      // ignore words shorter than 3 chars
      .filter(word => word.length >= 3);
  }

  /**
   * Highlight substrings in a string with a span wrapper.
   * Words are matched literally and case-insensitively.
   *
   * @param text - text to highlight in
   * @param words - word or list of words to highlight
   */
  private highlightSubstring(text: string, words: string|string[]) {
    const patterns = (typeof words === 'string' ? [words] : words)
      // an empty alternative would match at every position of the text
      .filter(word => word.length > 0)
      // longer words first, so "foobar" wins over its prefix "foo"
      .sort((a, b) => b.length - a.length)
      // words come from the user, escape RegExp metacharacters
      .map(word => word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));

    if (patterns.length === 0) {
      return text;
    }

    const wordRegExp = new RegExp(patterns.join('|'), 'ig');
    const CLASS_STYLE = 'search-word';

    return text.replace(wordRegExp, `<span class="${CLASS_STYLE}">$&</span>`);
  }
}
/**
 * The Search class is the default export of this module
 */
export default Search;