How can I optimize an SQL query for calculating word frequency

I am trying to populate two tables:


 word  | df (the number of documents containing a word)
"dog"  | 5
"cat"  | 2
"horse"| 1


tokenid | docid| tf (the number of times a word occurs in a document)
   1    |  1   | 6
   2    |  2   | 2
   3    |  2   | 1

using the data from documents:

id   |  title  |  body
1    |  "dog"  | "about dogs"
2    |  "cats" | "about cats"

To do that I use ts_stat('select to_tsvector(''english'', body) from documents'), which returns a table with the document frequency for each word and also the number of times that word appears in the entire column. While the second column (ndoc) is exactly what I need for the token table, the third column (nentry) shows the frequency for the entire column rather than for each individual document.

word | ndoc | nentry
dog  | 5  | 6
cat  | 2    | 2
horse| 1    | 1

This code populates the token table, and somehow does it in 3 seconds for a hundred documents.

-- Populate token with one row per distinct lexeme found in documents.body.
-- ndoc is the ts_stat column holding the number of documents that contain the lexeme.
INSERT INTO token (word, document_frequency)
SELECT
    word,
    ndoc
FROM
    ts_stat( 'select to_tsvector(''english'', body) from documents' );

I tried running the following code on a smaller dataset of 15 documents and it worked, but when I try to run it on the current dataset (100 documents) it never stops running.

-- Per-document term frequencies (the slow version this question asks about).
-- temp_data runs ts_stat once per document row, restricting it to that row's id,
-- and expands the result (word, ndoc, nentry) next to the document id.
WITH temp_data AS (
    SELECT id ,
           (ts_stat('select to_tsvector(''english'', body) from documents where id='||id)).*
    FROM documents
)
INSERT INTO token_count (docid, tokenid, tf)
SELECT
    id,
    -- Correlated lookup of the token id; evaluated once for every row of temp_data.
    (SELECT id FROM token WHERE word = temp_data.word LIMIT 1),
    nentry
FROM temp_data;

How can I optimize this query?

EXPLAIN ANALYZE for the dataset of 15 documents:

"Insert on token_count  (cost=1023803.22..1938766428.23 rows=9100000 width=28) (actual time=59875.204..59875.206 rows=0 loops=1)"
"  CTE temp_data"
"    ->  Result  (cost=0.00..1023803.22 rows=9100000 width=44) (actual time=0.144..853.320 rows=42449 loops=1)"
"          ->  ProjectSet  (cost=0.00..45553.23 rows=9100000 width=36) (actual time=0.142..809.366 rows=42449 loops=1)"
"                ->  Seq Scan on wikitable  (cost=0.00..19.10 rows=910 width=4) (actual time=0.010..0.029 rows=16 loops=1)"
"  ->  CTE Scan on temp_data  (cost=0.00..1937742625.00 rows=9100000 width=28) (actual time=0.509..59652.279 rows=42449 loops=1)"
"        SubPlan 2"
"          ->  Limit  (cost=0.00..212.92 rows=1 width=4) (actual time=1.381..1.381 rows=1 loops=42449)"
"                ->  Seq Scan on token  (cost=0.00..425.84 rows=2 width=4) (actual time=1.372..1.372 rows=1 loops=42449)"
"                      Filter: ((word)::text = temp_data.word)"
"                      Rows Removed by Filter: 10384"
"Planning Time: 0.202 ms"
"Execution Time: 59876.350 ms"

EXPLAIN ANALYZE for the dataset of 30 documents:

"Insert on token_count  (cost=1023803.22..6625550803.23 rows=9100000 width=28) (actual time=189910.438..189910.439 rows=0 loops=1)"
"  CTE temp_data"
"    ->  Result  (cost=0.00..1023803.22 rows=9100000 width=44) (actual time=0.191..2018.758 rows=92168 loops=1)"
"          ->  ProjectSet  (cost=0.00..45553.23 rows=9100000 width=36) (actual time=0.189..1919.726 rows=92168 loops=1)"
"                ->  Seq Scan on wikitable  (cost=0.00..19.10 rows=910 width=4) (actual time=0.013..0.053 rows=31 loops=1)"
"  ->  CTE Scan on temp_data  (cost=0.00..6624527000.00 rows=9100000 width=28) (actual time=1.009..189412.022 rows=92168 loops=1)"
"        SubPlan 2"
"          ->  Limit  (cost=0.00..727.95 rows=1 width=4) (actual time=2.029..2.029 rows=1 loops=92168)"
"                ->  Seq Scan on token  (cost=0.00..727.95 rows=1 width=4) (actual time=2.020..2.020 rows=1 loops=92168)"
"                      Filter: ((word)::text = temp_data.word)"
"                      Rows Removed by Filter: 16463"
"Planning Time: 0.234 ms"
"Execution Time: 189913.688 ms"
Total Answers 1

Answers 1 : of How can I optimize an SQL query for calculating word frequency

Here's a demo that doesn't use ts_stat to get the per-document word counts.

Instead, it uses a lateral join to an unnesting of the tsvector.

-- Demo source table: one row per document; body is the text passed to to_tsvector.
create table documents (
 document_id serial primary key,
 title varchar(30) not null,
 body text not null
);

-- Two sample documents used by the rest of the demo.
insert into documents (title, body) values
  ('dogs', 'the dog barked at the cat, but the cat ignored her.')
, ('cats', 'cats kill more birds than dogs kill cats');

-- One row per distinct lexeme; df is the number of documents containing it.
create table tokens (
 token_id serial primary key,
 word varchar(30),
 df int
);

-- ts_stat returns one row per distinct lexeme across all documents;
-- its ndoc column (number of documents containing the lexeme) becomes df.
insert into tokens (word, df)
SELECT word, ndoc
FROM ts_stat('select to_tsvector(''english'', body) from documents');
-- List tokens by document frequency, highest first.
-- token_id breaks ties so that rows with equal df come back in a stable order.
select token_id, word, df
from tokens
order by df desc, token_id;
token_id | word  | df
-------: | :---- | -:
       3 | dog   |  2
       4 | cat   |  2
       1 | kill  |  1
       2 | ignor |  1
       5 | bird  |  1
       6 | bark  |  1
-- Term frequency per (document, token) pair; tf is how often the token occurs in that document.
create table token_counts (
 document_id int,
 token_id int,
 tf int,
 primary key (document_id, token_id),
 foreign key (document_id) references documents(document_id),
 foreign key (token_id) references tokens(token_id)
);
-- Fill token_counts in a single set-based statement instead of calling ts_stat per document.
INSERT INTO token_counts (
 document_id,
 token_id,
 tf
)
select
 doc.document_id,
 tok.token_id,
 lex.total
from documents as doc
-- For each document, expand its tsvector into one row per lexeme;
-- the length of the positions array is the lexeme's occurrence count in that document.
cross join lateral (
  select lexeme, cardinality(positions) as total
  from unnest(to_tsvector('english', doc.body)) as tsvector
) as lex
-- Map each lexeme to its token_id with a join rather than a per-row subquery.
inner join tokens as tok
  on tok.word = lex.lexeme;
-- Show the term frequencies with readable titles and words, ordered by document then token.
select doc.title, tok.word, cnt.tf
from token_counts as cnt
inner join documents as doc using (document_id)
inner join tokens as tok using (token_id)
order by document_id, token_id;
title word tf
dogs ignor 1
dogs dog 1
dogs cat 2
dogs bark 1
cats kill 2
cats dog 1
cats cat 2
cats bird 1

Demo on db<>fiddle here

