Parse a String Expression into Columns - mysql

I have a string for example 32,21C2L5N8C stored in one field. Now I want to expand this string into as follows:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNCCCCCCCC
After getting the above string, I want to count number of commas,C's,L's and N's.
Can some one help me with this please?

You can extract the numbers and the non-numeric characters, then replicate each character. In SQL Server you can use the PATINDEX and REPLICATE functions (explanations are in the code comments):
-- Expands a run-length string such as '32,21C2L5N8C' (a count followed by the
-- single character to repeat) into the repeated characters.
-- Table variable holding one row per parsed (count, character) pair.
declare @t table(id int identity(1,1), num int, nonnum char(1))
declare @str1 varchar(50)='32,21C2L5N8C' -- your current given string
declare @int1 varchar(50)='' -- digits of the count currently being read
declare @str2 varchar(50)='' -- the non-numeric character that follows the count
declare @result varchar(max)=''
while len(@str1)>1 -- a pair needs at least one digit plus one character
begin
    while patindex('%[0-9]%', @str1)=1 -- peel leading digits off into @int1
    begin
        set @int1=@int1+substring(@str1,1,1)
        set @str1=substring(@str1,2,len(@str1)-1)
    end
    set @str2=substring(@str1,1,1) -- the character to repeat
    set @str1=substring(@str1,2,len(@str1)-1)
    insert into @t(num,nonnum) values (@int1,@str2)
    -- Append in parse order here. Concatenating afterwards with a multi-row
    -- "select @result=@result+... from @t" is not guaranteed to visit the rows
    -- in id order. The cast keeps replicate from capping a run at 8000 chars.
    set @result=@result+replicate(cast(@str2 as varchar(max)),@int1)
    set @int1=''
    set @str2=''
end
select @result
Output:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNCCCCCCCC
Edit 1: if the given string contains characters with no number in front of them and you want each of those printed once, you can extend the code above as follows:
-- Expands a run-length string in which a character may appear without a
-- count; such a character is emitted exactly once (e.g. the final C of
-- '32,21C2L5NC').
-- Table variable holding one row per parsed (count, character) pair.
declare @t table(id int identity(1,1), num int, nonnum char(1))
declare @str1 varchar(50)='32,21C2L5NC' -- your current given string
declare @int1 varchar(50)='' -- digits of the count currently being read
declare @str2 varchar(50)='' -- the non-numeric character that follows the count
declare @result varchar(max)=''
while len(@str1)>0 -- keep going until the whole string is consumed
begin
    while patindex('%[0-9]%', @str1)=1 -- peel leading digits off into @int1
    begin
        set @int1=@int1+substring(@str1,1,1)
        set @str1=substring(@str1,2,len(@str1)-1)
    end
    if len(@str1)=0 break -- trailing digits with no character: nothing to repeat
    set @str2=substring(@str1,1,1) -- the character to repeat
    set @str1=substring(@str1,2,len(@str1)-1)
    -- No digits in front of the character means "once". Handling it here, rather
    -- than in a second loop guarded by isnumeric(), also covers a count-less
    -- first character and characters such as ',' that isnumeric() accepts.
    if @int1='' set @int1='1'
    insert into @t(num,nonnum) values (@int1,@str2)
    -- Append in parse order; a multi-row "select @result=@result+..." over @t
    -- is not guaranteed to visit the rows in id order.
    set @result=@result+replicate(cast(@str2 as varchar(max)),@int1)
    set @int1=''
    set @str2=''
end
select @result
Output:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNC
Edit 2: if you want the number of repeats of each character, just query the #t table variable in above code, I mean at the end of above query say:
select nonnum [char],num [repeat] from #t
Output:
char repeat
, 32
C 21
L 2
N 5
C 1

You could do this by using a Pattern Splitter. Here is one taken from Dwain Camp's article. The function used, PatternSplitCM, is created by Chris Morris.
-- Splits @List into consecutive runs of characters that all match, or all fail
-- to match, the single-character LIKE pattern @Pattern.
-- Returns one row per run: ItemNumber (position of the run, starting at 1),
-- Item (the run's text) and Matched (1 if its characters match @Pattern, else 0).
CREATE FUNCTION [dbo].[PatternSplitCM]
(
     @List VARCHAR(8000) = NULL
    ,@Pattern VARCHAR(50)
) RETURNS TABLE WITH SCHEMABINDING
AS
RETURN
    WITH numbers AS (
        -- Four 10-row sets cross-joined give up to 10,000 rows; TOP keeps one
        -- row number per byte of @List (none when @List is NULL).
        SELECT TOP(ISNULL(DATALENGTH(@List), 0))
            n = ROW_NUMBER() OVER(ORDER BY (SELECT NULL))
        FROM
            (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) d (n),
            (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) e (n),
            (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) f (n),
            (VALUES (0),(0),(0),(0),(0),(0),(0),(0),(0),(0)) g (n))
    SELECT
        ItemNumber = ROW_NUMBER() OVER(ORDER BY MIN(n)),
        Item = SUBSTRING(@List,MIN(n),1+MAX(n)-MIN(n)),
        [Matched]
    FROM (
        -- Grouper stays constant across neighbouring positions that share the
        -- same Matched value, so each (Matched, Grouper) pair identifies one run.
        SELECT n, y.[Matched], Grouper = n - ROW_NUMBER() OVER(ORDER BY y.[Matched],n)
        FROM numbers
        CROSS APPLY (
            SELECT [Matched] = CASE WHEN SUBSTRING(@List,n,1) LIKE @Pattern THEN 1 ELSE 0 END
        ) y
    ) d
    GROUP BY [Matched], Grouper
Using the function above, you would want to split your string using the pattern '[^0-9]', which means not numeric. You would then pivot the result so that the corresponding number and character will be on the same row. After that, you use REPLICATE to generate the strings and concatenate them at the end.
Your final query is:
-- Expands a run-length string ('32,21C2L5N8C') by splitting it into numeric
-- and non-numeric runs with dbo.PatternSplitCM, pairing every number with the
-- character that follows it, and repeating that character.
DECLARE
    @String VARCHAR(8000),
    @Pattern VARCHAR(50),
    @Result VARCHAR(MAX)
SELECT
    @String = '32,21C2L5N8C',
    @Pattern = '[^0-9]',
    @Result = ''
;WITH Cte AS(
    -- Assumes the string starts with a number and then alternates number,
    -- character, number, ...: odd ItemNumbers are read as counts and even ones
    -- as characters, and (ItemNumber + 1) / 2 gives both halves the same ID.
    SELECT
        ID = (s.ItemNumber + 1)/ 2,
        Number = MAX(CASE WHEN s.ItemNumber % 2 = 1 THEN s.Item END),
        Character = MAX(CASE WHEN s.ItemNumber % 2 = 0 THEN s.Item END)
    FROM dbo.[PatternSplitCM](@String, @Pattern) s
    GROUP BY (s.ItemNumber + 1)/ 2
)
-- NOTE(review): building a string by assigning a variable across several rows
-- is not guaranteed to honour ORDER BY; prefer FOR XML PATH or STRING_AGG
-- where available if the order must be guaranteed.
SELECT @Result = @Result + REPLICATE(Character, Number) FROM Cte ORDER BY ID
SELECT @Result
SQL Fiddle
Here is the step by step explanation:
First, split the given string using the pattern '[^0-9]'.
SELECT * FROM dbo.[PatternSplitCM](#String, #Pattern) s
The result is:
ItemNumber Item Matched
-------------------- ---------- -----------
1 32 0
2 , 1
3 21 0
4 C 1
5 2 0
6 L 1
7 5 0
8 N 1
9 8 0
10 C 1
Second, pivot the result so that the corresponding number and character will be on the same row:
SELECT
ID = (s.ItemNumber + 1)/ 2,
Number = MAX(CASE WHEN s.ItemNumber % 2 = 1 THEN s.Item END),
Character = MAX(CASE WHEN s.ItemNumber % 2 = 0 THEN s.Item END)
FROM dbo.[PatternSplitCM](#String, #Pattern) s
GROUP BY (s.ItemNumber + 1)/ 2
The result is:
ID Number Character
------ ---------- ----------
1 32 ,
2 21 C
3 2 L
4 5 N
5 8 C
Last, use REPLICATE(Character, Number) to generate each string and concatenate them to get the final result:
SELECT #Result = #Result + REPLICATE(Character, Number) FROM Cte ORDER BY ID
SELECT #Result
The result is:
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,CCCCCCCCCCCCCCCCCCCCCLLNNNNNCCCCCCCC

Related

SQL Query that counts the number of characters match in two text columns

I need to count how many characters are equal in two text columns (same size, in the same table).
For example:
RowNum: Template: Answers:
------- --------- --------
1 ABCDEABCDEABCDE ABCDAABCDBABCDC
2 EDAEDAEDAEDAEDA EDBEDBEDBEDBEDB
SELECT SOME_COUNT_FUNCTION (Template, Answers) should return:
RowNum: Result:
------- -------
1 12
2 10
The database is a MySQL.
Not exactly MySQL, but here's something that works in SQL Server. Maybe it'll translate over.
-- Counts, for every row of #tmp, how many positions hold the same character
-- in Template and Answer, and stores that count in Result.
DROP TABLE IF EXISTS #tmp
CREATE TABLE #tmp (
    [RowNum] INT IDENTITY(1,1) PRIMARY KEY,
    [Template] NVARCHAR(20),
    [Answer] NVARCHAR(20),
    [Result] INT
)
INSERT INTO #tmp ([Template], [Answer], [Result])
VALUES ('ABCDEABCDEABCDE','ABCDAABCDBABCDC', NULL),
       ('EDAEDAEDAEDAEDA','EDBEDBEDBEDBEDB', NULL)
DECLARE @current_template NVARCHAR(50) -- Variable to hold the current template
    , @current_answer NVARCHAR(50)     -- Variable to hold the current answer
    , @template_char NCHAR(1)          -- Template letter (NCHAR: the columns are NVARCHAR)
    , @answer_char NCHAR(1)            -- Answer letter
    , @word_index INT                  -- Index (position) within each word
    , @match_counter INT               -- Match counter for each word
    , @max_iter INT = (SELECT MAX(RowNum) FROM #tmp) -- Highest RowNum: last row to process
    , @row_idx INT = (SELECT MIN(RowNum) FROM #tmp)  -- Lowest RowNum: first row to process
WHILE (@row_idx <= @max_iter)
BEGIN
    SET @match_counter = 0 -- Reset match counter for each row
    SET @word_index = 1    -- Reset word index for each row
    SET @current_template = (SELECT [Template] FROM #tmp WHERE RowNum = @row_idx)
    SET @current_answer = (SELECT [Answer] FROM #tmp WHERE RowNum = @row_idx)
    -- Walk the template one character at a time and compare with the answer.
    WHILE (@word_index <= LEN(@current_template))
    BEGIN
        SET @template_char = SUBSTRING(@current_template, @word_index, 1)
        SET @answer_char = SUBSTRING(@current_answer, @word_index, 1)
        IF (@answer_char = @template_char)
        BEGIN
            SET @match_counter += 1
        END
        SET @word_index += 1
    END
    UPDATE #tmp
    SET Result = @match_counter
    WHERE RowNum = @row_idx
    SET @row_idx += 1
END
Get values from the temp table:
SELECT * FROM #tmp
Output:
RowNum Template Answer Result
1 ABCDEABCDEABCDE ABCDAABCDBABCDC 12
2 EDAEDAEDAEDAEDA EDBEDBEDBEDBEDB 10
If you are running MySQL 8.0, you can use a recursive query compare the strings character by character:
-- Walks both strings one position at a time and keeps a running total of the
-- positions where the characters are equal; the largest total per row is the
-- number of matching characters.
WITH RECURSIVE walk AS (
    -- Anchor: every row of the table, starting at position 1 with no matches yet.
    SELECT rownum, template, answers, 1 AS pos, 0 AS matches
    FROM mytable
    UNION ALL
    -- Step: score the current position, then move to the next one.
    SELECT
        w.rownum,
        w.template,
        w.answers,
        w.pos + 1,
        w.matches + ( SUBSTR(w.template, w.pos, 1) = SUBSTR(w.answers, w.pos, 1) )
    FROM walk AS w
    WHERE w.pos <= LEAST(CHAR_LENGTH(w.template), CHAR_LENGTH(w.answers))
)
SELECT rownum, MAX(matches) AS result
FROM walk
GROUP BY rownum
ORDER BY rownum
In the CTE (the with clause), the anchor (the query before union all) selects the whole table. The recursive member (the query after union all) compares the characters at the current position (idx), increments the result (res) if they match, and advances to the next position, until the shorter string is exhausted. The outer query then takes the highest count per rownum.
Demo on DB Fiddle:
rownum | result
-----: | -----:
1 | 12
2 | 10
Please bear in mind that this query will not perform well against a large dataset. Other slightly more efficient solutions exist (typically, using a number table instead of a recursive cte), but basically, as commented by Gordon Linoff, you do want to fix your data structure if you need to run such queries. You should store each character in a separate row, along with its rownum and its index in the string. Materialize the proper data structure, and then you won't need to generate it on the fly in each and every query.

Selecting values with more than one occurrence of a character in SQL

Let me explain my question with an example
Consider the following column of values
City
-------
Chennai
Delhi
Mumbai
Output I want is
City
-------
Chennai
Mumbai
When you look at the values 'Chennai' has two 'N's and 'Mumbai' has two 'M's
What is the query to find the values that satisfy the above said condition
I am using MySQL
You may be able to use some of the logic from here and then filter that way Count all occurances of different characters in a column
Can you try this? If you want, you can wrap it in a function that accepts the value and the character to check as parameters.
IF(LEN('Chennai')-LEN(REPLACE('Chennai', 'N', ''))>1 )
Select 'Chennai'
A possible solution if city names contain only latin characters
-- Pairs every city with the offsets 0..25 (the letters a..z) and keeps the
-- cities in which removing some letter shortens the name by more than one
-- character, i.e. that letter occurs at least twice.
SELECT DISTINCT c.city
FROM table1 AS c
CROSS JOIN (
    -- 0, followed by 1..25 built from two 5-row digit sets.
    SELECT 0 AS n
    UNION ALL
    SELECT lo.N + hi.N * 5 + 1 AS n
    FROM (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4) AS lo
    CROSS JOIN (SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4) AS hi
    ORDER BY n
) AS letter
WHERE CHAR_LENGTH(c.city) - CHAR_LENGTH(REPLACE(LOWER(c.city), CHAR(97 + letter.n), '')) > 1
Output:
| CITY |
|---------|
| Mumbai |
| Chennai |
Here is SQLFiddle demo
You can use stored procedure for this. Please check my code -
Create table statement -
CREATE TABLE `Cities` (
`City` varchar(100) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
Added cities to table and created procedure -
CREATE PROCEDURE `SP_SplitString`()
BEGIN
    -- Returns the rows of `Cities` whose name contains some character at least
    -- twice (case-insensitively). Each name is scanned character by character;
    -- matching names are collected in a comma-separated list that the final
    -- SELECT filters on with FIND_IN_SET.
    -- NOTE(review): FIND_IN_SET cannot match a value that itself contains a
    -- comma, so such city names would not be returned.
    DECLARE front TEXT DEFAULT NULL;            -- character currently being counted
    DECLARE count INT DEFAULT 0;                -- occurrences of `front` in the city name
    DECLARE arrayText longtext default "";      -- comma-separated list of matching names
    DECLARE Value longtext DEFAULT "";          -- city name fetched from the cursor
    DECLARE val longtext DEFAULT "";            -- part of the name not yet examined
    DECLARE done INT DEFAULT FALSE;
    -- Name the column explicitly: FETCH reads into a single variable, so
    -- SELECT * would break as soon as the table gained a second column.
    DECLARE cityCursor CURSOR FOR SELECT `City` FROM `Cities`;
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
    OPEN cityCursor;
    loop_through_rows:
    LOOP
        FETCH cityCursor INTO Value;
        IF done THEN
            LEAVE loop_through_rows;
        END IF;
        SET val = Value;
        iterator:
        LOOP
            IF CHAR_LENGTH(TRIM(val)) = 0 OR val IS NULL THEN
                LEAVE iterator;
            END IF;
            SET front = LOWER(SUBSTRING(val,1,1));
            -- CHAR_LENGTH counts characters; LENGTH counts bytes and would
            -- report a single multi-byte character as occurring more than once.
            SET count = CHAR_LENGTH(Value) - CHAR_LENGTH(REPLACE(LOWER(Value), front, ''));
            IF count > 1 THEN
                IF CHAR_LENGTH(TRIM(arrayText)) = 0 THEN
                    SET arrayText = Value;
                ELSE
                    SET arrayText = CONCAT(arrayText,",",Value);
                END IF;
                LEAVE iterator; -- one repeated character is enough for this city
            END IF;
            -- Drop the character just examined and continue with the rest.
            IF CHAR_LENGTH(TRIM(val)) > 1 THEN
                SET val = SUBSTRING(val,2,CHAR_LENGTH(TRIM(val)));
            ELSE
                SET val = "";
            END IF;
        END LOOP;
    END LOOP;
    CLOSE cityCursor;
    SELECT * FROM `Cities` WHERE FIND_IN_SET(City, arrayText);
END

T-SQL: split and aggregate comma-separated values

I have the following table with each row having comma-separated values:
ID
-----------------------------------------------------------------------------
10031,10042
10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039
10009
20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041
I need to get a count for each ID (a groupby).
I am just trying to avoid cursor implementation and stumped on how to do this without cursors.
Any Help would be appreciated !
You will want to use a split function:
-- Splits @String on the single-character @Delimiter and returns one row per
-- non-blank item. Returns no rows for a NULL or empty @String.
create FUNCTION [dbo].[Split](@String varchar(MAX), @Delimiter char(1))
returns @temptable TABLE (items varchar(MAX))
as
begin
    declare @idx int
    declare @slice varchar(MAX) -- same type as the output column, so long items are not cut at 8000
    select @idx = 1
    if len(@String)<1 or @String is null return
    while @idx!= 0
    begin
        set @idx = charindex(@Delimiter,@String)
        if @idx!=0
            set @slice = left(@String,@idx - 1)
        else
            set @slice = @String
        if(len(@slice)>0)
            insert into @temptable(items) values(@slice)
        -- Keep everything after the delimiter. right(@String, len(@String) - @idx)
        -- is wrong here: len() ignores trailing spaces, so for an input with
        -- trailing spaces it keeps too few characters and loses real data.
        set @String = substring(@String, @idx + 1, datalength(@String))
        if len(@String) = 0 break
    end
    return
end;
And then you can query the data in the following manner:
select items, count(items)
from table1 t1
cross apply dbo.split(t1.id, ',')
group by items
See SQL Fiddle With Demo
Well, the solution i always use, and probably there might be a better way, is to use a function that will split everything. No use for cursors, just a while loop.
if OBJECT_ID('splitValueByDelimiter') is not null
begin
    drop function splitValueByDelimiter
end
go
-- Splits @inputValue on the single-character @delimiter and returns one row
-- per non-blank value. Returns no rows for a NULL or empty @inputValue.
create function splitValueByDelimiter (
    @inputValue varchar(max)
    , @delimiter varchar(1)
)
returns @results table (value varchar(max))
as
begin
    declare @delimiterIndex int
        , @tempValue varchar(max)
    set @delimiterIndex = 1
    while @delimiterIndex > 0 and len(isnull(@inputValue, '')) > 0
    begin
        set @delimiterIndex = charindex(@delimiter, @inputValue)
        if @delimiterIndex > 0
            set @tempValue = left(@inputValue, @delimiterIndex - 1)
        else
            set @tempValue = @inputValue
        if(len(@tempValue)>0)
        begin
            insert
            into @results
            select @tempValue
        end
        -- Keep everything after the delimiter. right(..., len(...) - index) is
        -- wrong here: len() ignores trailing spaces, so for an input with
        -- trailing spaces it keeps too few characters and loses real data.
        set @inputValue = substring(@inputValue, @delimiterIndex + 1, datalength(@inputValue))
    end
    return
end
After that you can call the output like this :
if object_id('test') is not null
begin
drop table test
end
go
create table test (
Id varchar(max)
)
insert
into test
select '10031,10042'
union all select '10064,10023,10060,10065,10003,10011,10009,10012,10027,10004,10037,10039'
union all select '10009'
union all select '20011,10027,10032,10063,10023,10033,20060,10012,10020,10031,10011,20036,10041'
select value
from test
cross apply splitValueByDelimiter(Id, ',')
Hope it helps, although i am still looping through everything
After reiterating the comment above about NOT putting multiple values into a single column (Use a separate child table with one value per row!),
Nevertheless, one possible approach: use a UDF to convert delimited string to a table. Once all the values have been converted to tables, combine all the tables into one table and do a group By on that table.
-- Splits the delimited text @S into one output row per value.
-- @delim may be up to 5 characters and defaults to '|' when NULL.
-- The text is processed through a window of at most @BtchSiz characters that
-- is moved forward whenever it holds no further delimiter.
-- Returns: ValNum (identity, order of insertion) and sVal (the value, with
-- leading blanks removed by LTrim).
-- NOTE(review): the Text data type is deprecated in current SQL Server
-- versions; VarChar(Max) is the usual replacement — confirm before changing
-- the signature.
Create Function dbo.ParseTextString (@S Text, @delim VarChar(5))
Returns @tOut Table
    (ValNum Integer Identity Primary Key,
     sVal VarChar(8000))
As
Begin
    Declare @dlLen TinyInt          -- Length of delimiter
    Declare @wind VarChar(8000)     -- Will Contain Window into text string
    Declare @winLen Integer         -- Length of Window
    Declare @isLastWin TinyInt      -- Boolean to indicate processing Last Window
    Declare @wPos Integer           -- Start Position of Window within Text String
    Declare @roVal VarChar(8000)    -- String Data to insert into output Table
    Declare @BtchSiz Integer        -- Maximum Size of Window
    Set @BtchSiz = 7900             -- (Reset to smaller values to test routine)
    Declare @dlPos Integer          -- Position within Window of next Delimiter
    Declare @Strt Integer           -- Start Position of each data value within Window
    -- -------------------------------------------------------------------------
    If @delim is Null Set @delim = '|'
    -- Nothing to return for empty text or text that is only the delimiter.
    If DataLength(@S) = 0 Or
        Substring(@S, 1, @BtchSiz) = @delim Return
    -- --------------------------------------------
    -- Load the first window; a window shorter than @BtchSiz is the last one.
    Select @dlLen = DataLength(@delim),
        @Strt = 1, @wPos = 1,
        @wind = Substring(@S, 1, @BtchSiz)
    Select @winLen = DataLength(@wind),
        @isLastWin = Case When DataLength(@wind) = @BtchSiz
            Then 0 Else 1 End,
        @dlPos = CharIndex(@delim, @wind, @Strt)
    -- --------------------------------------------
    While @Strt <= @winLen
    Begin
        If @dlPos = 0 Begin -- No More delimiters in window
            -- Last window: the final value runs to the end of the window.
            If @isLastWin = 1 Set @dlPos = @winLen + 1
            Else Begin
                -- Reload the window so that it starts at the current value.
                Set @wPos = @wPos + @Strt - 1
                Set @wind = Substring(@S, @wPos, @BtchSiz)
                -- ----------------------------------------
                Select @winLen = DataLength(@wind), @Strt = 1,
                    @isLastWin = Case When DataLength(@wind) = @BtchSiz
                        Then 0 Else 1 End,
                    @dlPos = CharIndex(@delim, @wind, 1)
                If @dlPos = 0 Set @dlPos = @winLen + 1
            End
        End
        -- -------------------------------
        Insert @tOut (sVal)
        Select LTrim(Substring(@wind,
            @Strt, @dlPos - @Strt))
        -- -------------------------------
        -- Move @Strt to char after last delimiter
        Set @Strt = @dlPos + @dlLen
        Set @dlPos = CharIndex(@delim, @wind, @Strt)
    End
    Return
End
Then write, (using your table schema),
Declare #AllVals VarChar(8000)
Select #AllVals = Coalesce(#allVals + ',', '') + ID
From Table Where ID Is Not null
-- -----------------------------------------
Select sVal, Count(*)
From dbo.ParseTextString(#AllVals, ',')
Group By sval

Fetch the occurrences of particular words in particular column of a table

I have near about 200 words. I want to see how many times those words occurred in a column of a table.
e.g: say we have table test with column statements which has two rows.
How are you. It's been long since I met you.
I am fine how are you.
Now I want to find the the occurrences of words "you" and "how". Output should be something like:
word count
you 3
how 2
since "you" has 3 and how has 2 occurrences in the two rows.
How can I do this?
You can do it like this:
Split the phrase and put all items in a different table;
Remove all punctuation;
Make a select using the created table and the words that you want to identify.
The way I would approach this is to write a little user defined function to give me the number of times one string appears in another with some allowances for:
upper and lower case
common punctuation
I would then create a table with all of the words that I wish to search with i.e. your 200 list. Then use the function to count the number of occurrences of each word in every phrase, put that in a inline view and then sum the results up by search word.
Hence:
User Defined Function
DELIMITER $$
CREATE FUNCTION `get_word_count`(phrase VARCHAR(500),word VARCHAR(255), delimiter VARCHAR(1)) RETURNS int(11)
READS SQL DATA
BEGIN
  -- Counts how many delimiter-separated tokens of `phrase` equal `word`.
  -- Common punctuation in the phrase is replaced by spaces and the phrase is
  -- lower-cased before it is split on `delimiter`.
  DECLARE cur_position INT DEFAULT 1 ;        -- position of the next delimiter (0 = none left)
  DECLARE remainder TEXT;                     -- part of the phrase not yet examined
  DECLARE cur_string VARCHAR(502);            -- current token wrapped in delimiters
  DECLARE delimiter_length TINYINT UNSIGNED;
  DECLARE result INT DEFAULT 0;
  DECLARE string2 VARCHAR(257);               -- search word wrapped in delimiters
  SET remainder = replace(phrase,'!',' ');
  SET remainder = replace(remainder,'.',' ');
  SET remainder = replace(remainder,',',' ');
  SET remainder = replace(remainder,'?',' ');
  SET remainder = replace(remainder,':',' ');
  SET remainder = replace(remainder,'(',' ');
  SET remainder = replace(remainder,')',' '); -- closing bracket, to match the opening one
  SET remainder = lower(remainder);
  SET string2 = concat(delimiter,trim(word),delimiter);
  SET delimiter_length = CHAR_LENGTH(delimiter);
  SET cur_position = 1;
  WHILE CHAR_LENGTH(remainder) > 0 AND cur_position > 0 DO
    SET cur_position = INSTR(remainder, delimiter);
    IF cur_position = 0 THEN
      -- Last token: wrap it in delimiters like every other token. Left bare,
      -- it is matched as a substring, so a final 'to' counts as a hit for 'too'.
      SET cur_string = concat(delimiter,remainder,delimiter);
    ELSE
      SET cur_string = concat(delimiter,LEFT(remainder, cur_position - 1),delimiter);
    END IF;
    IF TRIM(cur_string) != '' THEN
      IF INSTR(string2,cur_string) > 0 THEN
        SET result = result + 1;
      END IF;
    END IF;
    SET remainder = SUBSTRING(remainder, cur_position + delimiter_length);
  END WHILE;
  RETURN result;
END$$
DELIMITER ;
You might have to play with this function a little depending on what allowances you need to make for punctuation and case. Hopefully you get the idea here though!
Populate tables
create table search_word
(id int unsigned primary key auto_increment,
word varchar(250) not null
);
insert into search_word (word) values ('you');
insert into search_word (word) values ('how');
insert into search_word (word) values ('to');
insert into search_word (word) values ('too');
insert into search_word (word) values ('the');
insert into search_word (word) values ('and');
insert into search_word (word) values ('world');
insert into search_word (word) values ('hello');
create table phrase_to_search
(id int unsigned primary key auto_increment,
phrase varchar(500) not null
);
insert into phrase_to_search (phrase) values ("How are you. It's been long since I met you");
insert into phrase_to_search (phrase) values ("I am fine how are you?");
insert into phrase_to_search (phrase) values ("Oh. Not bad. All is ok with the world, I think");
insert into phrase_to_search (phrase) values ("I think so too!");
insert into phrase_to_search (phrase) values ("You know what? I think so too!");
Run Query
-- Scores every search word against every phrase, then adds the per-phrase
-- counts up for each word.
SELECT
    t.word,
    SUM(t.word_count) AS total_word_count
FROM (
    SELECT
        p.phrase,
        s.word,
        get_word_count(p.phrase, s.word, " ") AS word_count
    FROM search_word AS s
    CROSS JOIN phrase_to_search AS p
) AS t
GROUP BY t.word
ORDER BY total_word_count DESC;
Here is a solution:
SELECT SUM(total_count) as total, value
FROM (
SELECT count(*) AS total_count, REPLACE(REPLACE(REPLACE(x.value,'?',''),'.',''),'!','') as value
FROM (
SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(t.sentence, ' ', n.n), ' ', -1) value
FROM table_name t CROSS JOIN
(
SELECT a.N + b.N * 10 + 1 n
FROM
(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) a
,(SELECT 0 AS N UNION ALL SELECT 1 UNION ALL SELECT 2 UNION ALL SELECT 3 UNION ALL SELECT 4 UNION ALL SELECT 5 UNION ALL SELECT 6 UNION ALL SELECT 7 UNION ALL SELECT 8 UNION ALL SELECT 9) b
ORDER BY n
) n
WHERE n.n <= 1 + (LENGTH(t.sentence) - LENGTH(REPLACE(t.sentence, ' ', '')))
ORDER BY value
) AS x
GROUP BY x.value
) AS y
GROUP BY value
Here is the full working fiddle: http://sqlfiddle.com/#!2/17481a/1
First we run a query to extract all words, as explained here by @peterm (follow his instructions if you want to change the maximum number of words processed). Then we turn that into a subquery and COUNT and GROUP BY each word, and finally wrap that in another query that groups together words that differ only by attached punctuation, using REPLACE (e.g. hello = hello!).
Below is the simple solution for the case when you need to count certain word occurrences, not the complete statistics:
SELECT COUNT(*) FROM `words` WHERE `row1` LIKE '%how%';
SELECT COUNT(*) FROM `words` WHERE `row1` LIKE '%you%';

SQL Server 2008 and HashBytes

I have quite a large nvarchar which I wish to pass to the HashBytes function.
I get the error:
"String or binary would be truncated.
Cannot insert the value NULL into
column 'colname', table 'table';
column does not allow nulls. UPDATE
fails. The statement has been
terminated."
Being ever resourceful, I discovered this was due to the HashBytes function having a maximum limit of 8000 bytes. Further searching showed me a 'solution' where my large varchar would be divided and hashed separately and then later combined with this user defined function:
-- Hashes #InputDataString in slices of 8000 bytes with HASHBYTES and returns
-- the digests of all slices concatenated.
-- #algorithm is declared nvarchar(4), so only algorithm names of up to four
-- characters (e.g. 'SHA1') fit.
function [dbo].[udfLargeHashTable] (#algorithm nvarchar(4), #InputDataString varchar(MAX))
RETURNS varbinary(MAX)
AS
BEGIN
DECLARE
#Index int,
#InputDataLength int,
#ReturnSum varbinary(max),
#InputData varbinary(max)
-- NOTE(review): this assigns the integer 0, not an empty binary value, so the
-- result starts with four zero bytes (see the 0x00000000... sample output later
-- in this thread). 0x would start from an empty value.
SET #ReturnSum = 0
SET #Index = 1
-- NOTE(review): convert(binary, ...) gives no length; per the CAST/CONVERT
-- documentation the default is 30 bytes, which would pad or cut the input to
-- 30 bytes and explain why 'test' hashes differently here than with a plain
-- HASHBYTES call — confirm; convert(varbinary(max), ...) keeps the whole input.
SET #InputData = convert(binary,#InputDataString)
SET #InputDataLength = DATALENGTH(#InputData)
-- Hash one 8000-byte slice per iteration and append its digest.
WHILE #Index <= #InputDataLength
BEGIN
SET #ReturnSum = #ReturnSum + HASHBYTES(#algorithm, SUBSTRING(#InputData, #Index, 8000))
SET #Index = #Index + 8000
END
RETURN #ReturnSum
END
which I call with:
set #ReportDefinitionHash=convert(int,dbo.[udfLargeHashTable]('SHA1',#ReportDefinitionForLookup))
Where #ReportDefinitionHash is int, and #ReportDefinitionForLookup is the varchar
Passing a simple char like 'test' produces a different int with my UDF than a normal call to HashBytes would produce.
Any advice on this issue?
If you can't create a function and have to use something that already exists in the DB:
sys.fn_repl_hash_binary
can be made to work using the syntax:
sys.fn_repl_hash_binary(cast('some really long string' as varbinary(max)))
Taken from: http://www.sqlnotes.info/2012/01/16/generate-md5-value-from-big-data/
Just use this function (taken from Hashing large data strings with a User Defined Function):
create function dbo.fn_hashbytesMAX
    ( @string nvarchar(max)
    , @Algo varchar(10)
    )
    returns varbinary(20)
as
/************************************************************
*
* Author: Brandon Galderisi
* Last modified: 15-SEP-2009 (by Denis)
* Purpose: uses the system function hashbytes as well
* as sys.fn_varbintohexstr to split an
* nvarchar(max) string into 4000 character
* (8000 byte) chunks, hash each chunk, take up
* to 40 characters of each hash as hex text,
* stream those into one string and hash that
* string (recursively, while it is still longer
* than one chunk).
* Returns: varbinary(20); strings of at most 4000
* characters return hashbytes(@Algo, @string).
*
*************************************************************/
begin
    declare @concat nvarchar(max)
           ,@NumHash int   -- number of 4000 character chunks in @string
           ,@HASH varbinary(20)
    set @NumHash = ceiling((datalength(@string)/2)/(4000.0))
    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    if @NumHash>1
    begin
        -- one row per 4000 character chunk
        ;with a as (select 1 as n union all select 1) -- 2
        ,b as (select 1 as n from a ,a a1) -- 4
        ,c as (select 1 as n from b ,b b1) -- 16
        ,d as (select 1 as n from c ,c c1) -- 256
        ,e as (select 1 as n from d ,d d1) -- 65,536
        ,f as (select 1 as n from e ,e e1) -- 4,294,967,296 rows (x 4000 = 17+ TRILLION characters)
        ,factored as (select row_number() over (order by n) rn from f)
        ,factors as (select rn,(rn*4000)+1 factor from factored)
        select @concat = cast((
            select right(sys.fn_varbintohexstr
                (
                    hashbytes(@Algo, substring(@string, factor - 4000, 4000))
                )
                , 40) + ''
            from factors
            where rn <= @NumHash
            order by rn -- chunk hashes must be joined in chunk order for a stable result
            for xml path('')
        ) as nvarchar(max))
        set @HASH = dbo.fn_hashbytesMAX(@concat ,@Algo)
    end
    else
    begin
        set @HASH = convert(varbinary(20), hashbytes(@Algo, @string))
    end
    return @HASH
end
And the results are as following:
select
hashbytes('sha1', N'test') --native function with nvarchar input
,hashbytes('sha1', 'test') --native function with varchar input
,dbo.fn_hashbytesMAX('test', 'sha1') --Galderisi's function which casts to nvarchar input
,dbo.fnGetHash('sha1', 'test') --your function
Output:
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0xA94A8FE5CCB19BA61C4C0873D391E987982FBBD3
0x87F8ED9157125FFC4DA9E06A7B8011AD80A53FE1
0x00000000AE6DBA4E0F767D06A97038B0C24ED720662ED9F1
I've taken the accepted answer, and modified it a bit with the following improvements:
no longer recursive function
now schema bound
no longer relying on undocumented stored procedures
two versions: one for nvarchar, one for varchar
returns same data size as HASHBYTES, leaving it up to the end user to convert to smaller based on algorithm used. This allows the functions to support future algorithms with larger data returns.
With these changes, the functions can now be used in persisted computed columns as they are now marked deterministic when created.
-- Hashes an NVARCHAR(MAX) value of any length with HASHBYTES.
-- While @Text is longer than 8000 bytes it is cut into 4000 character chunks,
-- every chunk is hashed, and @Text is replaced by the hashes written as
-- '0x...' hex text joined in chunk order; the remaining text is then hashed.
-- Values of at most 8000 bytes return HASHBYTES(@Algorithm, @Text).
CREATE FUNCTION dbo.fnHashBytesNVARCHARMAX
(
    @Algorithm VARCHAR(10),
    @Text NVARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @NumHash INT; -- number of 8000 byte chunks in @Text
    DECLARE @HASH VARBINARY(8000);
    SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE @NumHash > 1
    BEGIN
        -- one row per 4000 character chunk
        WITH a AS
            (SELECT 1 AS n UNION ALL SELECT 1), -- 2
        b AS
            (SELECT 1 AS n FROM a, a a1), -- 4
        c AS
            (SELECT 1 AS n FROM b, b b1), -- 16
        d AS
            (SELECT 1 AS n FROM c, c c1), -- 256
        e AS
            (SELECT 1 AS n FROM d, d d1), -- 65,536
        f AS
            (SELECT 1 AS n FROM e, e e1), -- 4,294,967,296 rows (x 4000 = 17+ TRILLION characters)
        factored AS
            (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        factors AS
            (SELECT rn, (rn * 4000) + 1 factor FROM factored)
        SELECT @Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(@Algorithm, SUBSTRING(@Text, factor - 4000, 4000)), 1)
                FROM factors
                WHERE rn <= @NumHash
                ORDER BY rn -- chunk hashes must be joined in chunk order for a stable result
                FOR XML PATH('')
            ) AS NVARCHAR(MAX)
        );
        SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    END;
    SET @HASH = CONVERT(VARBINARY(8000), HASHBYTES(@Algorithm, @Text));
    RETURN @HASH;
END;
-- Hashes a VARCHAR(MAX) value of any length with HASHBYTES.
-- While @Text is longer than 8000 bytes it is cut into 8000 character chunks,
-- every chunk is hashed, and @Text is replaced by the hashes written as
-- '0x...' hex text joined in chunk order; the remaining text is then hashed.
-- Values of at most 8000 bytes return HASHBYTES(@Algorithm, @Text).
CREATE FUNCTION dbo.fnHashBytesVARCHARMAX
(
    @Algorithm VARCHAR(10),
    @Text VARCHAR(MAX)
)
RETURNS VARBINARY(8000)
WITH SCHEMABINDING
AS
BEGIN
    DECLARE @NumHash INT; -- number of 8000 byte chunks in @Text
    DECLARE @HASH VARBINARY(8000);
    SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    /* HashBytes only supports 8000 bytes so split the string if it is larger */
    WHILE @NumHash > 1
    BEGIN
        -- one row per 8000 character chunk
        WITH a AS
            (SELECT 1 AS n UNION ALL SELECT 1), -- 2
        b AS
            (SELECT 1 AS n FROM a, a a1), -- 4
        c AS
            (SELECT 1 AS n FROM b, b b1), -- 16
        d AS
            (SELECT 1 AS n FROM c, c c1), -- 256
        e AS
            (SELECT 1 AS n FROM d, d d1), -- 65,536
        f AS
            (SELECT 1 AS n FROM e, e e1), -- 4,294,967,296 rows (x 8000 = 34+ TRILLION characters)
        factored AS
            (SELECT ROW_NUMBER() OVER (ORDER BY n) rn FROM f),
        factors AS
            (SELECT rn, (rn * 8000) + 1 factor FROM factored)
        SELECT @Text = CAST
        (
            (
                SELECT CONVERT(VARCHAR(MAX), HASHBYTES(@Algorithm, SUBSTRING(@Text, factor - 8000, 8000)), 1)
                FROM factors
                WHERE rn <= @NumHash
                ORDER BY rn -- chunk hashes must be joined in chunk order for a stable result
                FOR XML PATH('')
            ) AS VARCHAR(MAX) -- hex text only, so it fits the VARCHAR parameter unchanged
        );
        SET @NumHash = CEILING(DATALENGTH(@Text) / (8000.0));
    END;
    SET @HASH = CONVERT(VARBINARY(8000), HASHBYTES(@Algorithm, @Text));
    RETURN @HASH;
END;
You could write a SQL CLR function:
/// <summary>
/// Hashes the UTF-8 encoding of <paramref name="data"/> with the algorithm
/// named by <paramref name="algorithm"/>.
/// </summary>
/// <param name="algorithm">Algorithm name passed to HashAlgorithm.Create, e.g. "md5".</param>
/// <param name="data">Text to hash.</param>
/// <returns>The hash bytes, or SQL NULL when either argument is NULL.</returns>
/// <exception cref="System.ArgumentException">The algorithm name is not recognised.</exception>
/// <remarks>
/// NOTE(review): the text is encoded as UTF-8 here; whether that matches what
/// HASHBYTES sees for non-ASCII varchar/nvarchar data should be confirmed.
/// </remarks>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlBinary BigHashBytes(SqlString algorithm, SqlString data)
{
    // Reading .Value of a NULL SqlString throws; return NULL for NULL input instead.
    if (algorithm.IsNull || data.IsNull)
    {
        return SqlBinary.Null;
    }

    // The hash object is disposable, so release it when done.
    using (var algo = HashAlgorithm.Create(algorithm.Value))
    {
        // Create returns null for an unknown name; fail with a clear message
        // rather than a null dereference.
        if (algo == null)
        {
            throw new System.ArgumentException("Unknown hash algorithm: " + algorithm.Value, "algorithm");
        }

        var bytes = Encoding.UTF8.GetBytes(data.Value);
        return new SqlBinary(algo.ComputeHash(bytes));
    }
}
And then it can be called in SQL like this:
--these return the same value
select HASHBYTES('md5', 'test stuff')
select dbo.BigHashBytes('md5', 'test stuff')
The BigHashBytes is only necessary if the length would be over 8k.
tested and working
select master.sys.fn_repl_hash_binary(someVarbinaryMaxValue)
moreover not complicated :)
This can be used as function body, too:
-- Hashes an NVARCHAR(MAX) value of any length: every 4000 character part is
-- hashed with SHA2_256, the hashes are concatenated, and the concatenation is
-- hashed once more.
DECLARE @A NVARCHAR(MAX) = N'test'
DECLARE @res VARBINARY(MAX) = 0x -- concatenated hashes of the parts
DECLARE @position INT = 1        -- first character of the part being hashed
       -- Length in characters. DATALENGTH counts bytes (two per character here);
       -- comparing it with a character position made the loop hash extra, empty
       -- parts. ISNULL keeps a NULL @A from looping forever.
       ,@len INT = ISNULL(DATALENGTH(@A), 0) / 2
WHILE 1 = 1
BEGIN
    SET @res = @res + HASHBYTES('SHA2_256', SUBSTRING(@A, @position, 4000))
    SET @position = @position+4000
    IF @position > @len
        BREAK
END
SELECT HASHBYTES('SHA2_256',@res)
The idea is to hash each 4000-character part of the NVARCHAR(MAX) string and concatenate the results, then hash that concatenated result.
It seems the easiest solution is to write a recursive hashing algorithm that parses the input text value into sub varchar(8000) segments.
I arbitrarily chose to slice the input string into 7500 character segments
The hashing algorithm returns a varbinary(20) which can easily be converted into a varchar(20)
-- Returns a SHA1 hash for text of any length. Text longer than 7500 characters
-- is hashed recursively: the first 7500 characters are hashed together with a
-- hex rendering of the hash of the rest. NULL input returns the hash of the
-- literal string 'null'.
-- NOTE(review): the text is handled as varchar, so characters outside the
-- database code page may not survive the conversion — confirm for Unicode data.
-- NOTE(review): the recursion goes one level deeper per 7500 characters; SQL
-- Server limits nested calls, so check the maximum input size you need.
ALTER FUNCTION [dbo].[BigHash]
(
    @TextValue nvarchar(max)
)
RETURNS varbinary(20)
AS
BEGIN
    -- "= null" never evaluates to true; IS NULL is the test that actually
    -- detects a NULL argument.
    if @TextValue is null
        return hashbytes('SHA1', 'null')
    Declare @FirstPart as varchar(7500)
    Declare @Remainder as varchar(max)
    Declare @RemainderHash as varbinary(20)
    Declare @BinaryValue as varbinary(20)
    Declare @TextLength as integer
    Set @TextLength = len(@TextValue)
    if @TextLength > 7500
    Begin
        Set @FirstPart = substring(@TextValue, 1, 7500)
        Set @Remainder = substring(@TextValue, 7501, @TextLength - 7500)
        Set @RemainderHash = dbo.BigHash(@Remainder)
        -- NOTE(review): varchar(20) keeps only the first 20 hex digits of the
        -- 40-digit remainder hash; left unchanged so existing hashes stay stable.
        Set @BinaryValue = hashbytes('SHA1', @FirstPart + convert( varchar(20), @RemainderHash, 2 ))
        return @BinaryValue
    End
    else
    Begin
        Set @FirstPart = substring(@TextValue, 1, @TextLength)
        Set @BinaryValue = hashbytes('SHA1', @FirstPart)
        return @BinaryValue
    End
    -- Not reached; a scalar function must end with a RETURN statement.
    return null
END