我解决这个问题的思路是:提取所有单词,并统计每个单词出现的次数!
因此,创建一个类似于您存储的数据的表:
-- Sample review table: one row per review, holding its category and free text.
CREATE TABLE `tbltest` (
    `Rev_id` int(11) NOT NULL AUTO_INCREMENT,
    `place_id` int(11) DEFAULT NULL,
    `Stars` int(11) DEFAULT NULL,
    -- Category is the value the sentence_words procedure filters on.
    `Category` varchar(45) DEFAULT NULL,
    -- Text is the sentence that sentence_words splits into words.
    `Text` varchar(255) DEFAULT NULL,
    -- The primary key already enforces uniqueness of Rev_id, so a separate
    -- UNIQUE KEY on the same column would only be a duplicate index.
    PRIMARY KEY (`Rev_id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
然后再创建一个用于保存单词及其出现次数的表:
-- Word tally table: one row per distinct word with the number of times it was seen.
CREATE TABLE `counting` (
    `word` varchar(45) NOT NULL,
    -- NOT NULL so that "counts = counts + 1" in an upsert can never be stuck at NULL.
    `counts` int(11) NOT NULL DEFAULT 0,
    -- The primary key already enforces uniqueness of word, so a separate
    -- UNIQUE KEY on the same column would only be a duplicate index.
    PRIMARY KEY (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
现在,创建一个 MySQL 存储过程(Stored Procedure),用于拆分句子并统计单词:
-- sentence_words(Cat)
--   Reads the Text of every tbltest row whose Category equals Cat, splits each
--   text into space-separated words and adds the words to the counting table
--   (one row per word, counts incremented on every further occurrence).
--   Parameter:
--     Cat  category value to match against tbltest.Category.
--   Notes:
--     * The characters . ! , ; are treated as word separators.
--     * Rows whose Text is NULL or blank contribute no words.
--     * The procedure only adds to counting; it never clears it.
--     * Each sentence is written in its own transaction; START TRANSACTION
--       implicitly commits any transaction the caller has open.
drop procedure if exists sentence_words;
delimiter #
create procedure sentence_words(IN Cat VARCHAR(45))
begin
    declare w_max int unsigned default 0;
    declare w_counter int unsigned default 0;
    declare done int unsigned default 0;
    declare sentence varchar(255) default null;
    declare cur cursor for
        select `Text` from `tbltest` where `Category` = Cat;
    -- FETCH past the last row raises NOT FOUND; the handler turns that into a flag.
    declare continue handler for not found set done = 1;

    open cur;
    myloop: loop
        fetch cur into sentence;
        if done = 1 then
            leave myloop;
        end if;

        -- Turn punctuation into spaces and strip leading and trailing spaces.
        -- COALESCE makes a NULL Text behave like an empty sentence.
        set sentence = trim(
            replace(replace(replace(replace(
                coalesce(sentence, ''),
            '.', ' '), '!', ' '), ',', ' '), ';', ' ')
        );

        -- Collapse every run of spaces to a single space. One REPLACE pass only
        -- halves a run, so repeat until no double space is left; otherwise
        -- empty strings would be counted as words.
        while instr(sentence, '  ') > 0 do
            set sentence = replace(sentence, '  ', ' ');
        end while;

        -- Skip blank sentences, which would otherwise insert an empty word.
        if char_length(sentence) > 0 then
            -- Number of words = number of separating spaces + 1.
            set w_max = char_length(sentence)
                      - char_length(replace(sentence, ' ', '')) + 1;
            -- Restart the word index for every sentence; without this reset only
            -- the first fetched sentence would be processed correctly.
            set w_counter = 0;

            start transaction;
            while w_counter < w_max do
                -- The inner SUBSTRING_INDEX keeps the first (w_counter + 1) words,
                -- the outer one keeps the last word of that prefix.
                insert into `counting` (`word`, `counts`)
                values (
                    substring_index(
                        substring_index(sentence, ' ', w_counter + 1),
                        ' ', -1),
                    1
                )
                on duplicate key update `counts` = `counts` + 1;
                set w_counter = w_counter + 1;
            end while;
            commit;
        end if;
    end loop;
    close cur;
end #
delimiter ;
最后,您可以调用该过程,并在 counting 表中查看单词及其出现次数。如果需要按类别分别统计词频,请记住在为每个类别调用该过程之前,先清空(truncate)或备份 counting 表。
-- Start from an empty tally so counts from earlier runs are not mixed in.
truncate `counting`;
call sentence_words('Bar');
-- Most frequent words first; ties are broken alphabetically so the output is stable.
-- To skip very short words, add "where char_length(word) > 2" before the order by.
select
    `word`,
    `counts`
from `counting`
order by `counts` desc, `word`;
-- words | counts --
'audience', '1'
'bad', '1'
'place', '1'
'Poor', '1'