MySQL - word frequency count on long textual field - mysql

I have a MySQL table, where one of the fields contains a textual description (~5-200 words).
For example Reviews:
Rev_id Place_id Stars Category Text
1 12 3 Food Nice food but a bad dirty place.
2 31 4 Sport Not bad, they have everything.
3 55 1 Bar Poor place,bad audience.
I'd like to make some word count analysis, such as general word frequency count (how many times each of the words has appeared) or top-K words per Category.
In the example:
word count
bad 3
place 2
...
Is there a way to do it solely in MySQL without involving programming languages?

My logic for this question is: extract all words and count them!
So, create a table like your stored data:
CREATE TABLE `tbltest` (
`Rev_id` int(11) NOT NULL AUTO_INCREMENT,
`place_id` int(11) DEFAULT NULL,
`Stars` int(11) DEFAULT NULL,
`Category` varchar(45) DEFAULT NULL,
`Text` varchar(255) DEFAULT NULL,
PRIMARY KEY (`Rev_id`),
UNIQUE KEY `id_UNIQUE` (`Rev_id`)
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8;
And creating a table for words:
CREATE TABLE `counting` (
`word` varchar(45) NOT NULL,
`counts` int(11) DEFAULT NULL,
PRIMARY KEY (`word`),
UNIQUE KEY `word_UNIQUE` (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Now, create the MySQL Stored Procedure for splitting sentences and counting words:
drop procedure if exists sentence_words;
delimiter #
create procedure sentence_words(IN Cat VARCHAR(45))
begin
declare w_max int unsigned default 1;
declare w_counter int unsigned default 0;
declare done int unsigned default 0;
declare sentence varchar(255) default null;
declare cur cursor for select `text` from `tbltest` where `Category` = Cat;
declare continue handler for not found set done=1;
set done=0;
open cur;
myloop: loop
fetch cur into sentence;
if done = 1 then leave myloop; end if;
-- refine sentence!
set sentence = replace(replace(replace(replace(
sentence
,'.',' '),'!',' '),',',' '),';',' ');
set sentence = replace(trim(sentence),' ',' ');
set w_max = length(sentence)-length(replace(sentence,' ',''))+1;
start transaction;
while w_counter < w_max do
insert into `counting`(counts,word) values
(1, substring_index( substring_index(
sentence,' ',w_counter+1) ,' ',-1)
)
ON DUPLICATE KEY UPDATE counts=counts+1;
set w_counter=w_counter+1;
end while;
commit;
end loop;
close cur;
end #
delimiter ;
Finally, you can call the procedure and find words and counts in counting table. If you need each category word counts separated, remember to truncate or backup counting table before calling procedure for each Category.
truncate `counting`;
call sentence_words('Bar');
select * from `counting` order by counts desc; -- ? where length(word)>2
-- words | counts --
'audience', '1'
'bad', '1'
'place', '1'
'Poor', '1'

Related

Column count doesn't match with the value count in row 1

I am trying to populate a mqsql table with random values. I created a stored procedure for this but it gives me the error "column count doesn't match with the value count in row 1" when I execute it.
Below is what I have tried
The stored procedure
CREATE DEFINER=`root`#`localhost` PROCEDURE `GenerateManagerData`()
BEGIN
DECLARE i INT DEFAULT 1;
WHILE i <= 100 DO
INSERT INTO projectManager(id,name,email,contact_number) VALUES (i+ 'M', CAST(i AS CHAR(10)), 'e'+ CAST(i AS CHAR(10))+ '#gmail.com', 'TP' + CAST(i AS CHAR(10)));
SET i = i + 1;
END WHILE;
END
This is the table I have created
CREATE TABLE `projectmanager` (
`id` int(10) NOT NULL AUTO_INCREMENT,
`name` varchar(100) DEFAULT NULL,
`email` varchar(100) DEFAULT NULL,
`contact_number` varchar(20) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `email_UNIQUE` (`email`)
);
I have given all the columns in the stored procedure. Please help me with this.
It was an issue with the CAST funation. Using CONCAT() fixed the issue,
CREATE DEFINER=`root`#`localhost` PROCEDURE `GenerateManagerData`()
BEGIN
DECLARE i INT DEFAULT 38;
WHILE i <= 100 DO
INSERT INTO projectManager(name,email,contact_number) VALUES (CONCAT('M', i), CONCAT('e', i, '#gmail.com'), CONCAT('TP', i));
SET i = i + 1;
END WHILE;
END

MySQL Non Sequential ID

Challenge:
Create a method to set "auto_increment" values for tables in a non-sequential way.
The goal is to override the "auto_increment" mechanism and allow the function "LAST_INSERT_ID()" to continue working as expected (returning an INT), so that no changes are needed in software side.
My Solution
The method I found is based on an auxiliary table (unique_id), that stores values available to be assigned. Values are then selected randomly, and removed from the tables as used. When the table gets empty, a new set of ID's is created.
This example is working as expected, but with one problem.
Tables for the demo:
CREATE TABLE `unique_id` (
`id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
PRIMARY KEY (`id`)
)
COLLATE='latin1_swedish_ci'
ENGINE=InnoDB
AUTO_INCREMENT=100;
CREATE TABLE `test_unique_id` (
`id` INT(10) UNSIGNED NOT NULL AUTO_INCREMENT,
`name` VARCHAR(50) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='latin1_swedish_ci'
ENGINE=InnoDB
AUTO_INCREMENT=1;
Defined a stored procedure and a function:
DELIMITER $$
DROP PROCEDURE IF EXISTS `UNIQUE_ID_REFILL`$$
CREATE PROCEDURE UNIQUE_ID_REFILL()
BEGIN
DECLARE a INT Default 0 ;
simple_loop: LOOP
SET a=a+1;
INSERT INTO unique_id (id) values(null);
IF a=100 THEN
LEAVE simple_loop;
END IF;
END LOOP simple_loop;
END $$
DROP FUNCTION IF EXISTS `UNIQUE_ID_GET`$$
CREATE FUNCTION UNIQUE_ID_GET()
RETURNS INT(11)
MODIFIES SQL DATA
BEGIN
DECLARE new_id INT(11);
DECLARE unique_id_count INT(11);
SET new_id = 0;
SELECT COUNT(*) INTO unique_id_count FROM unique_id;
IF unique_id_count=0 THEN
CALL UNIQUE_ID_REFILL();
END IF;
SELECT id INTO new_id FROM unique_id ORDER BY RAND() LIMIT 1;
DELETE FROM unique_id WHERE id = new_id;
RETURN new_id;
END $$
Created a Trigger on the destination table (test_unique_id):
CREATE TRIGGER test_unique_id__unique_id BEFORE INSERT ON test_unique_id
FOR EACH ROW
SET NEW.id = UNIQUE_ID_GET();
The solution is getting the random ID's as expected:
INSERT INTO test_unique_id(name) VALUES ('A'),('B'),('C');
Creates the rows:
id name
154 'A'
129 'B'
173 'C'
The Problem
The main problem is that LAST_INSERT_ID() stops working... and the software side is broken:
SELECT LAST_INSERT_ID();
0
Any ideas on how to solve this problem? or any other different approach to the challenge?
Thank you very much.

MySQL optimize comparing function

I'm working on an Android program that introduces in a table approximately 15000 integer values(somewhere between 350-500 lines with 32 columns). In the DB I also have other similar values. This 15000 values that I'm talking about represent a processed image, so basically I want to compare the similarity of two images. Now, when I try to compare the values of two images(I'm comparing value by value and count the equal ones), only the data writing process takes about 7 minutes, which is way too long(I want to be able to write and compare at least 5 images in that time). I know that usually you don't work with this kind of things directly in the DB, but do you think that there is anything that I can do, or is it necessary to do this comparison on the server? The values returned by the descriptor came as line elements separated by ',' and each line is separated by ';'. I take each returned element and save it in a tables column. Here is my code:
Split function:
CREATE DEFINER=`root`#`localhost` FUNCTION `strSplit`(textIn longtext, delim varchar(12), count int) RETURNS int(11)
BEGIN
declare splitString INT(11);
SET splitString = replace(substring(substring_index(textIn, delim, count), length(substring_index(textIn, delim, count - 1)) + 1), delim, '');
RETURN splitString;
END
The function that creates the table:
CREATE TABLE IF NOT EXISTS `myguide`.`objectlocation` (
`ObjectLocationId` INT(11) NOT NULL AUTO_INCREMENT,
`ValueObject` LONGTEXT NOT NULL,
`DescriptorSize` INT(11) NOT NULL,
`DescriptionObject` VARCHAR(45) NOT NULL,
`DataInsert` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
`InsertBy` VARCHAR(45) NULL DEFAULT NULL,
PRIMARY KEY (`ObjectLocationId`))
ENGINE = InnoDB
AUTO_INCREMENT = 2
DEFAULT CHARACTER SET = utf8
And this is the code that does the insert part:
CREATE DEFINER=`root`#`localhost` PROCEDURE `myguide_sp_info_imageId`(descriptorIn longtext, sizeDescriptor INT)
BEGIN
declare sizeImagesTable INT DEFAULT (select count(*) from objectLocation);
declare descriptorSizeImage INT;
declare descriptor INT;
declare sizeDescriptorImage INT DEFAULT sizeDescriptor;
declare contorInsertImage INT default 1;
declare descriptorForSplit longtext;
declare descriptorImageSaved longtext;
declare descriptorForSplitImageSaved longtext;
/* check if table exist, drop*/
DROP TEMPORARY TABLE IF EXISTS backupObjectLocation;
/* Create temporar table for store info about objectLocation*/
CREATE TEMPORARY TABLE backupObjectLocation (
id INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
idImage int NOT NULL,
descriptorSaved longtext not null,
sizeDescriptorSaved float not null
);
/* check if table exist, drop*/
DROP TEMPORARY TABLE IF EXISTS processImage;
/* Create temporar table for store info about objectLocation*/
CREATE TEMPORARY TABLE processImage (
id INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
descriptorSaved varchar(255) ,
descriptorReceived varchar(255)
);
SET descriptorImageSaved = RTRIM(descriptorIn);
SET descriptorForSplit = REPLACE(descriptorImageSaved, ';', ',');
INSERT INTO backupObjectLocation (idImage, descriptorSaved, sizeDescriptorSaved)
SELECT ObjectLocationId, ValueObject, DescriptorSize FROM objectLocation;
loop_insertDescriptorImage: LOOP
if contorInsertImage > sizeDescriptorImage then
leave loop_insertDescriptorImage;
end if;
SET descriptor = strSplit(descriptorForSplit, ',', contorInsertImage);
INSERT INTO processImage (descriptorReceived) VALUES (descriptor);
SET contorInsertImage = contorInsertImage + 1;
ITERATE loop_insertDescriptorImage;
end LOOP;
loop_table: LOOP
if sizeImagesTable > 1 then
leave loop_table;
end if;
SET descriptorSizeImage = (SELECT sizeDescriptorSaved from backupObjectLocation where id = sizeImagesTable);
loop_image: LOOP
if descriptorSizeImage > 1 then
leave loop_image;
end if;
SET descriptorImageSaved = (SELECT descriptorSaved from backupObjectLocation where id = sizeImagesTable);
SET descriptorForSplitImageSaved = REPLACE(descriptorImageSaved, ';', ',');
SET descriptorSizeImage = descriptorSizeImage + 1;
ITERATE loop_image;
end LOOP;
SET sizeImagesTable = sizeImagesTable + 1;
ITERATE loop_table;
end LOOP;
select descriptorImageSaved;
select * from backupObjectLocation;
select * from processImage;
END
Please help me find a solution.

MySQL CURSOR loop adds extra pass in stored procedure

I'm having a problem with a cursor fetch loop in a mysql stored procedure. My stored procedure runs a reordering process which works just fine until the last record of the sort where the order numbering skips a single digit. For example, if I have 10 records and the sort ordering procedure starts at 1, all digits from 1 to 10 should be shown in the resulting records. However, my stored procedure skips the last count, 10 in the case above, and renumbers the final record 11, so the count goes from 9 to 11. This is the case regardless of the number of records involved.
The procedure's logic is fairly simple:
I have a table that holds product type records, with a sort_order column that is used to reorder the records based on usage during a regular batch cycle.
CREATE TABLE `PRODUCT_TYPE` (
`PRODUCT_TYPE_ID` int(11) NOT NULL AUTO_INCREMENT,
`PRODUCT_TYPE_NAME` varchar(45) NOT NULL,
`PRODUCT_CATEGORY_ID` int(11) DEFAULT NULL,
`LIFESPAN_MONTHS` int(11) DEFAULT NULL,
`USER_ID` int(11) DEFAULT NULL,
`UPDATED_BY` int(11) DEFAULT NULL,
`UPDATED_DATE` datetime DEFAULT NULL,
`CREATED_DATE` datetime DEFAULT NULL,
`CREATED_BY` int(11) DEFAULT NULL,
`REVIEWED` bit(1) NOT NULL DEFAULT b'0',
`SORT_ORDER` int(11) NOT NULL DEFAULT '0',
PRIMARY KEY (`PRODUCT_TYPE_ID`),
KEY `fk_PRODUCT_TYPE_PRODUCT_CATEGORY1_idx` (`PRODUCT_CATEGORY_ID`),
KEY `fk_PRODUCT_TYPE_USERS1_idx` (`USER_ID`),
KEY `fk_PRODUCT_TYPE_USERS2_idx` (`UPDATED_BY`),
KEY `fk_PRODUCT_TYPE_USERS3_idx` (`CREATED_BY`),
CONSTRAINT `fk_PRODUCT_TYPE_PRODUCT_CATEGORY1` FOREIGN KEY (`PRODUCT_CATEGORY_ID`) REFERENCES `PRODUCT_CATEGORY` (`PRODUCT_CATEGORY_ID`) ON DELETE NO ACTION ON UPDATE NO ACTION,
CONSTRAINT `fk_PRODUCT_TYPE_USERS1` FOREIGN KEY (`USER_ID`) REFERENCES `USERS` (`USER_ID`) ON DELETE NO ACTION ON UPDATE NO ACTION,
CONSTRAINT `fk_PRODUCT_TYPE_USERS2` FOREIGN KEY (`UPDATED_BY`) REFERENCES `USERS` (`USER_ID`) ON DELETE NO ACTION ON UPDATE NO ACTION,
CONSTRAINT `fk_PRODUCT_TYPE_USERS3` FOREIGN KEY (`CREATED_BY`) REFERENCES `USERS` (`USER_ID`) ON DELETE NO ACTION ON UPDATE NO ACTION
) ENGINE=InnoDB AUTO_INCREMENT=61 DEFAULT CHARSET=latin1;
I run the following stored procedure on a nightly basis to reorder the product type records based on the number of references to each type using the sort_order column to record the order.
DELIMITER $$
CREATE DEFINER=`root`#`localhost` PROCEDURE `REORDER_MANUFACTURERS`()
BEGIN
DECLARE DONE BOOL;
DECLARE MID INT;
DECLARE MNAME VARCHAR(255);
DECLARE USES INT;
DECLARE SORT_ORDER_COUNTER INT;
DECLARE CUR CURSOR FOR SELECT M.MANUFACTURER_ID, M.MANUFACTURER_NAME, COUNT(U.UNIT_ID) AS USES
FROM MANUFACTURERS M LEFT JOIN mydb.UNITS U ON M.MANUFACTURER_ID = U.MANUFACTURER_ID
GROUP BY M.MANUFACTURER_ID, M.MANUFACTURER_NAME
ORDER BY USES DESC, MANUFACTURER_NAME;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET DONE = TRUE;
SET SORT_ORDER_COUNTER = 0;
OPEN CUR;
READ_LOOP: LOOP
FETCH CUR INTO MID, MNAME, USES;
UPDATE MANUFACTURERS SET SORT_ORDER = SORT_ORDER_COUNTER WHERE MANUFACTURER_ID = MID;
IF DONE THEN
LEAVE READ_LOOP;
END IF;
SET SORT_ORDER_COUNTER = SORT_ORDER_COUNTER + 1;
END LOOP;
CLOSE CUR;
END
For the life of me, I can't find a problem with this logic that would cause the count to skip a beat. Any help would be appreciated.
Change:
FETCH CUR INTO MID, MNAME, USES;
UPDATE MANUFACTURERS SET SORT_ORDER = SORT_ORDER_COUNTER
WHERE MANUFACTURER_ID = MID;
IF DONE THEN
LEAVE READ_LOOP;
END IF;
To:
FETCH CUR INTO MID, MNAME, USES;
IF DONE THEN
LEAVE READ_LOOP;
END IF;
UPDATE MANUFACTURERS SET SORT_ORDER = SORT_ORDER_COUNTER
WHERE MANUFACTURER_ID = MID;
It is because:
If you FETCH past the last row in the result set, the values of the target fields or variables are indeterminate and the NOTFOUND attribute returns TRUE.
Refer to: (This is on Oracle cursors, but applicable to others as well):
Oracle: Fetch Statement

Generate auto incremented id for BPM application

Within a BPM web application, I have a field for an invoice # on a particular page but I need for it to be auto generated every time a user attaches an invoice and views that page. That number must be unique and preferably auto-incremented. A value for the invoice # field can be displayed by querying from a table from an external MYSQL database. So every time a user lands on that particular page, a SELECT query statement can be fired.
On MYSQL end, how would I set this up? So basically, I would like to setup a query for that invoice # field where it will for run a query for example,
SELECT invoice_num FROM invoice_generator
and every time this query runs, it would return the next incremented number.
You can use mysql trigger concept here....
I have added one example here...
It will be very usefull for u (see this link also :http://www.freemindsystems.com/mysql-triggers-a-practical-example/)
CREATE TABLE `products` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(50) NOT NULL DEFAULT '',
`price` int(20) NOT NULL DEFAULT '0',
`other` varchar(50) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `products_name_idx` (`name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `freetags` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`tag` varchar(50) NOT NULL DEFAULT '',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `freetagged_objects` (
`tag_id` int(20) NOT NULL DEFAULT '0',
`object_id` int(20) NOT NULL DEFAULT '0',
`tagged_on` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`module` varchar(50) NOT NULL DEFAULT '',
PRIMARY KEY (`tag_id`, `object_id`),
KEY `freetagged_objects_tag_id_object_id_idx` (`tag_id`, `object_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
INSERT_PRODUCTS_TAGS
DELIMITER ||
DROP TRIGGER IF EXISTS insert_products_tags;
||
DELIMITER ##
CREATE TRIGGER insert_products_tags AFTER INSERT ON products
FOR EACH ROW
BEGIN
DECLARE current_id integer;
DECLARE tag_id integer;
DECLARE next integer;
DECLARE tag_field varchar(255);
DECLARE next_sep integer;
DECLARE current_tag varchar(255);
DECLARE right_tag varchar(255);
-- We use the field other as comma-separated tag_field
SET tag_field = NEW.other;
-- Check for empty tags
IF (CHAR_LENGTH(tag_field) <> 0) THEN
-- Loop until no more ocurrencies
set next = 1;
WHILE next = 1 DO
-- Find possition of the next ","
SELECT INSTR(tag_field, ',') INTO next_sep;
IF (next_sep > 0) THEN
SELECT SUBSTR(tag_field, 1, next_sep - 1) INTO current_tag;
SELECT SUBSTR(tag_field, next_sep + 1, CHAR_LENGTH(tag_field)) INTO right_tag;
set tag_field = right_tag;
ELSE
set next = 0;
set current_tag = tag_field;
END IF;
-- Drop spaces between comas
SELECT TRIM(current_tag) INTO current_tag;
-- Insert the tag if not already present
IF (NOT EXISTS (SELECT tag FROM freetags WHERE tag = current_tag)) THEN
-- Insert the tag
INSERT INTO freetags (tag) values (current_tag);
SELECT LAST_INSERT_ID() INTO tag_id;
ELSE
-- Or get the id
SELECT id FROM freetags WHERE tag = current_tag INTO tag_id;
END IF;
-- Link the object tagged with the tag
INSERT INTO freetagged_objects
(tag_id, object_id, module)
values
(tag_id, NEW.id, 'products');
END WHILE;
END IF;
END;
##
Now If you execute an insert on products table:
INSERT INTO PRODUCTS
(name, price, other)
values
("product1", 2, "tag1, tag2,tag3 , tag 4");