I have an application that allows users to persist strings to a database, and those strings may contain emojis. The problem I have is that an emoji such as 😊 gets stored in MySQL as ðŸ˜Š.
When I retrieve this string using a PHP MySQL client and render it in a web browser, it renders fine, probably because the Content-Type is set to UTF-8. When I try to read the string in node.js, I get back what I think is the ISO8859-1 encoding: a literal ðŸ˜Š. The charset on the table is set to latin1, and that's where I'm getting ISO8859-1 from.
What's the right way to encode the string in node.js so that I can see the emoji and not the encoding set by MySQL when I console.log the string?
ðŸ˜Š is Mojibake for 😊. Interpreting the former as latin1, you get hex F09F988A, which is the UTF-8 hex for that Emoji.
(Note: UTF-8 outside MySQL is equivalent to utf8mb4 inside MySQL.)
In MySQL, you must have the column/table declared with CHARACTER SET utf8mb4. You must also state that the data being stored/fetched is encoded utf8mb4. Note: utf8 will not suffice.
Do a SELECT HEX(col) FROM ... to see if you get that hex for that Emoji. If that is the case and the column is currently latin1, then part of the fix is to carefully convert the column to utf8mb4. That is, you have CHARACTER SET latin1, but have UTF-8 bytes in it; this will leave bytes alone while fixing charset. Assuming the column is already VARCHAR(111) CHARACTER SET latin1 NOT NULL, then do this 2-step ALTER:
-- Step 1: retype the column as binary so the stored bytes are left alone.
ALTER TABLE tbl MODIFY COLUMN col VARBINARY(111) NOT NULL;
-- Step 2: declare those same bytes as utf8mb4 text (fixes the charset label only).
ALTER TABLE tbl MODIFY COLUMN col VARCHAR(111) CHARACTER SET utf8mb4 NOT NULL;
Virtually any other conversion mechanism will make a worse mess.
As for establishing the connection correctly, it goes something like this for node.js:
var connection = mysql.createConnection({ ... , charset : 'utf8mb4'});
You do not need, and should not convert encoding. Just use the right protocols. If you send the HTML page in UTF-8, the browser will send the data back to your server in UTF-8.
Then you want to store the data to your database which is in latin1, that won't work at all. You must convert your database to UTF-8 as well. That includes the database, the tables, and eventually the columns themselves. Also make sure that your database client is configured to connect in UTF-8, because the client itself has to declare its encoding.
Once you have the whole data-flux in UTF-8, everything will work flawlessly.
Server -> GET HTML -> POST -> Server -> SQL Client -> Database -> Table -> Column
It is recommended to use iconv (a simple ISO-8859-1 to UTF-8 conversion).
From this gist
var iconv = require('iconv');
/**
 * Runs `body` through a node-iconv converter configured for
 * ISO-8859-1 -> UTF-8 and returns the converted bytes as a string.
 *
 * @param {*} body - data handed to `Iconv#convert` unchanged.
 * @returns {string} the converted buffer decoded as UTF-8.
 */
function toUTF8(body) {
  const latin1ToUtf8 = new iconv.Iconv('iso-8859-1', 'utf-8');
  return latin1ToUtf8.convert(body).toString('utf-8');
}
Here, if you pass anything in ISO-8859-1, it will return its UTF-8 equivalent.
for example,
// The argument is the mojibake text as read from the latin1 column.
toUTF8("ðŸ˜Š");
will return 😊
I have found a super dirty way to convert it back:
// Characters that Windows-1252 (MySQL's "latin1") shows for bytes 0x80-0x9F,
// mapped back to their byte value. Every other byte up to 0xFF has the same
// value as its Unicode code point, so it needs no table entry; 'ð' is kept
// only because the original table listed it.
const isoToUtfTable = {
  '\u20ac': 0x80, // €
  '\u201a': 0x82, // ‚
  '\u0192': 0x83, // ƒ
  '\u201e': 0x84, // „
  '\u2026': 0x85, // …
  '\u2020': 0x86, // †
  '\u2021': 0x87, // ‡
  '\u02c6': 0x88, // ˆ
  '\u2030': 0x89, // ‰
  '\u0160': 0x8a, // Š
  '\u2039': 0x8b, // ‹
  '\u0152': 0x8c, // Œ
  '\u017d': 0x8e, // Ž
  '\u2018': 0x91, // ‘
  '\u2019': 0x92, // ’
  '\u201c': 0x93, // “
  '\u201d': 0x94, // ”
  '\u2022': 0x95, // •
  '\u2013': 0x96, // –
  '\u2014': 0x97, // —
  '\u02dc': 0x98, // ˜
  '\u2122': 0x99, // ™
  '\u0161': 0x9a, // š
  '\u203a': 0x9b, // ›
  '\u0153': 0x9c, // œ
  '\u017e': 0x9e, // ž
  '\u0178': 0x9f, // Ÿ
  '\u00f0': 0xf0, // ð
};

/**
 * Converts mojibake text (UTF-8 bytes that were decoded as latin1/cp1252)
 * into a "binary string" in which every character's code is one original byte.
 *
 * - Characters listed in `isoToUtfTable` become their mapped byte.
 * - Any other character with a code point <= 0xFF (ASCII and ISO-8859-1)
 *   is already its own byte and passes through unchanged.
 * - Characters that cannot be a single byte are replaced by '?'.
 *
 * @param {string} s - the mojibake text.
 * @returns {string} one character (code 0-255) per input character.
 */
function convertISO8859ToUtf8(s) {
  // Build the result incrementally: spreading a large array into
  // String.fromCharCode() can exceed the engine's argument limit.
  let bytes = '';
  for (const c of s) {
    const mapped = isoToUtfTable[c];
    if (mapped !== undefined) {
      bytes += String.fromCharCode(mapped);
    } else if (c.codePointAt(0) <= 0xff) {
      bytes += c;
    } else {
      bytes += '?';
    }
  }
  return bytes;
}
/**
 * Decodes a "binary string" (one character per byte) as UTF-8.
 *
 * @param {string} s - string whose character codes are UTF-8 byte values.
 * @returns {string} the decoded text.
 */
function decode_utf8(s) {
  // escape() percent-encodes the byte-valued characters; decodeURIComponent()
  // then reads the %XX sequences back as UTF-8.
  const percentEncoded = escape(s);
  return decodeURIComponent(percentEncoded);
}
// Feed the mojibake text (not the emoji itself) through both steps; prints 😊.
console.log(decode_utf8(convertISO8859ToUtf8('ðŸ˜Š')));
Now you simply need to complete the isoToUtfTable table (it's small, see https://en.wikipedia.org/wiki/ISO/IEC_8859-1).
Maybe try to look at node-iconv.
// Create a node-iconv converter with 'ISO-8859-2' as source and 'UTF-8' as target.
// NOTE(review): the question's table is latin1 (ISO-8859-1), not ISO-8859-2 — confirm the source charset.
const iconv = new Iconv('ISO-8859-2', 'UTF-8');
// `something` stands for the data to convert; it is not defined in this snippet.
const buffer = iconv.convert(something);
// Log the converted result as returned by convert().
console.log(buffer);
// Log the same result rendered through toString('UTF8').
console.log(buffer.toString('UTF8'));
This is a full answer based on @zerkms's solution:
/**
 * Maps every character that MySQL's latin1 (Windows-1252) can produce for a
 * byte in 0x80-0xFF back to that byte value.
 *
 * The table is generated instead of typed out so that no entry can be
 * mistyped or replaced by a placeholder name: keys are always the real
 * single characters (including NBSP U+00A0 and soft hyphen U+00AD).
 */
const isoToUtfTable = (() => {
  // Characters for bytes 0x80-0x9F, in byte order. The five bytes that cp1252
  // leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) are represented by the
  // control characters with the same code, which is how MySQL's latin1 maps them.
  const c1Range =
    '\u20ac\u0081\u201a\u0192\u201e\u2026\u2020\u2021' +
    '\u02c6\u2030\u0160\u2039\u0152\u008d\u017d\u008f' +
    '\u0090\u2018\u2019\u201c\u201d\u2022\u2013\u2014' +
    '\u02dc\u2122\u0161\u203a\u0153\u009d\u017e\u0178';

  const table = {};
  for (let i = 0; i < c1Range.length; i++) {
    table[c1Range[i]] = 0x80 + i;
  }
  // Bytes 0xA0-0xFF are identical to Unicode code points U+00A0-U+00FF.
  for (let byte = 0xa0; byte <= 0xff; byte++) {
    table[String.fromCharCode(byte)] = byte;
  }
  return table;
})();
// Start positions of the mojibake runs found by the latest call to
// convertISO8859ToUtf8Simple(); read afterwards by finalString().
let offsetArray = [];

/**
 * Maps each character of `s` through `isoToUtfTable` into a binary string
 * (characters without a table entry become NUL) and, as a side effect,
 * records in `offsetArray` where each run of mapped characters starts.
 * A mapped character opens a new run when it is the first one seen or when
 * it lies more than 3 positions after the previously recorded start.
 *
 * @param {string} s - text that may contain mojibake sequences.
 * @returns {string} one character (code 0-255) per input character.
 */
function convertISO8859ToUtf8Simple(s) {
  offsetArray = [];
  const chars = [...s];
  const bytes = new Uint8Array(chars.length);
  chars.forEach((ch, pos) => {
    const byte = isoToUtfTable[ch];
    if (byte) {
      const lastStart = offsetArray[offsetArray.length - 1];
      if (offsetArray.length == 0 || lastStart + 3 < pos) {
        offsetArray.push(pos);
      }
    }
    // Storing `undefined` in a Uint8Array yields 0, i.e. NUL for unmapped characters.
    bytes[pos] = byte;
  });
  return String.fromCharCode(...bytes);
}
/**
 * Interprets the character codes of `s` as UTF-8 bytes and returns the
 * decoded text.
 *
 * @param {string} s - binary string (one character per byte).
 * @returns {string} the decoded text.
 */
function decode_utf8(s) {
  // Percent-encode the byte-valued characters, then let decodeURIComponent()
  // read the %XX sequences as UTF-8.
  const asPercentEscapes = escape(s);
  const decoded = decodeURIComponent(asPercentEscapes);
  return decoded;
}
/**
 * Splits `str` around surrogate pairs (astral characters such as emoji) and
 * returns the pieces in order, dropping empty pieces and any piece that
 * contains a NUL character.
 *
 * All working variables are local: the previous version assigned to
 * undeclared names, which throws a ReferenceError in strict mode and ES
 * modules and leaks globals elsewhere.
 *
 * @param {string} str - text to split.
 * @returns {string[]} surrogate pairs and NUL-free text pieces, in order.
 */
function emojiStringToArray(str) {
  // The capture group makes split() keep each matched surrogate pair.
  const pieces = str.split(/([\uD800-\uDBFF][\uDC00-\uDFFF])/);
  return pieces.filter((piece) => piece !== "" && !piece.includes('\u0000'));
}
// Sample input: UTF-8 emoji bytes that were read back as latin1 (mojibake).
// The last emoji contains bytes 0x90 and 0x9D, which show up as the invisible
// control characters U+0090 and U+009D.
const string = 'hello ðŸ˜ŒðŸ˜Œ with some emojis ðŸ˜Š ðŸ\u0090\u009d';

// Characters Windows-1252 (MySQL's latin1) uses for bytes 0x80-0x9F, in byte
// order; the index of a character in this string is its byte value minus 0x80.
const CP1252_HIGH_CHARS =
  '\u20ac\u0081\u201a\u0192\u201e\u2026\u2020\u2021' +
  '\u02c6\u2030\u0160\u2039\u0152\u008d\u017d\u008f' +
  '\u0090\u2018\u2019\u201c\u201d\u2022\u2013\u2014' +
  '\u02dc\u2122\u0161\u203a\u0153\u009d\u017e\u0178';

// Returns the byte (0x80-0xFF) a mojibake character stands for, or undefined
// when the character is ASCII or cannot come from a single latin1/cp1252 byte.
function mojibakeCharToByte(ch) {
  const code = ch.codePointAt(0);
  if (code >= 0x80 && code <= 0xff) {
    return code;
  }
  const index = CP1252_HIGH_CHARS.indexOf(ch);
  return index === -1 ? undefined : 0x80 + index;
}

/**
 * Repairs mojibake: every run of characters that stand for high bytes is
 * re-read as UTF-8 and replaced by the decoded text.
 *
 * - ASCII and characters that cannot be a single byte pass through unchanged.
 * - A run that is not valid UTF-8 (e.g. a genuine 'é') is left as it is, so
 *   the function never throws on mixed input.
 * - Works for UTF-8 sequences of any length, not only 4-byte emoji.
 *
 * @param {string} s - text that may contain mojibake.
 * @returns {string} the text with valid mojibake runs decoded.
 */
function finalString(s) {
  // fatal: reject invalid UTF-8 instead of inserting U+FFFD;
  // ignoreBOM: keep a decoded U+FEFF instead of silently dropping it.
  const decoder = new TextDecoder('utf-8', { fatal: true, ignoreBOM: true });
  let repaired = '';
  let pendingBytes = [];
  let pendingText = '';

  const flushRun = () => {
    if (pendingBytes.length === 0) {
      return;
    }
    try {
      repaired += decoder.decode(Uint8Array.from(pendingBytes));
    } catch (err) {
      // Not valid UTF-8: the run was real text, keep it untouched.
      repaired += pendingText;
    }
    pendingBytes = [];
    pendingText = '';
  };

  for (const ch of s) {
    const byte = mojibakeCharToByte(ch);
    if (byte === undefined) {
      flushRun();
      repaired += ch;
    } else {
      pendingBytes.push(byte);
      pendingText += ch;
    }
  }
  flushRun();
  return repaired;
}
console.log(finalString(string));
Related
I created in Go some conversion for decoding inputs.
It is working as charm, (thanks to "golang.org/x/net/html/charset"), but now I have to limit output to characters contained only in utf8mb3.
As far as I know, go default "builtin" is full utf-8.
The problem is that the underlying database setting is locked by vendor rules and set to utf8mb3 (yes, MySQL); we can't change it.
So far I'm using this to limit characters and rewrite "unallowed" to "*":
// Allow-list pattern: a negated character class, so it matches every character
// that is NOT one of the listed letters, digits, punctuation or whitespace.
allowedCharsREGEX = `[^ěščřžýáíéúůťňĺľŕĚŠČŘŽÝÁÍÉÚŮŤŇĹĽŔ!?§©®±%¼½¾¿ß÷£¥¢~¡#&_\"\\/:;a-zA-Z_0-9\t\n\r\ ]`
// MustCompile panics if the pattern is invalid.
reg := regexp.MustCompile(allowedCharsREGEX)
procString := outStr
// Replace every character outside the allow-list with "*".
procString = reg.ReplaceAllString(outStr,"*")
to limit output characters but want to expand it to utf8mb3 char list.
From documentation seems unicode IsValid is full utf8.
Any possible "quick solution"?
Go v.1.13, ubuntu 20.04
Not everything should be done with a regexp.
utf8mb3 contains all runes from the BMP which can be encoded with 3 Bytes in UTF-8.
// Copy input rune by rune, replacing every rune that does not fit into
// utf8mb3 (more than 3 bytes in UTF-8, i.e. above U+FFFF) with '*'.
sb := &strings.Builder{}
for _, r := range input {
	// U+FFFF itself still encodes in 3 bytes, so the bound is inclusive.
	if r <= 0xFFFF {
		sb.WriteRune(r)
	} else {
		sb.WriteByte('*')
	}
}
// String is a method and must be called to obtain the accumulated text.
return sb.String()
I have a table I need to handle various characters. The characters include Ø, ® etc.
I have set my table to utf-8 as the default collation, all columns use table default, however when I try to insert these characters I get error: Incorrect string value: '\xEF\xBF\xBD' for column 'buyerName' at row 1
My connection string is defined as
string mySqlConn = "server="+server+";user="+username+";database="+database+";port="+port+";password="+password+";charset=utf8;";
I am at a loss as to why I am still seeing errors. Have I missed anything with either the .net connector, or with my MySQL setup?
--Edit--
My (new) C# insert statement looks like:
// Parameter markers for MySQL use '@' (a '#' starts a comment in MySQL and
// would cut off the rest of the statement line). Each marker name must match
// the name passed to AddWithValue.
// NOTE(review): the names on the "paymentsDate,shipmentDate,..." line of the
// VALUES list carry no marker prefix in the original — confirm whether they
// should also be '@' parameters.
MySqlCommand insert = new MySqlCommand( "INSERT INTO fulfilled_Shipments_Data " +
"(amazonOrderId,merchantOrderId,shipmentId,shipmentItemId,"+
"amazonOrderItemId,merchantOrderItemId,purchaseDate,"+ ...
VALUES (@amazonOrderId,@merchantOrderId,@shipmentId,@shipmentItemId,"+
"@amazonOrderItemId,@merchantOrderItemId,@purchaseDate,"+
"paymentsDate,shipmentDate,reportingDate,buyerEmail,buyerName,"+ ...
insert.Parameters.AddWithValue("@amazonOrderId",lines[0]);
insert.Parameters.AddWithValue("@merchantOrderId",lines[1]);
insert.Parameters.AddWithValue("@shipmentId",lines[2]);
insert.Parameters.AddWithValue("@shipmentItemId",lines[3]);
insert.Parameters.AddWithValue("@amazonOrderItemId",lines[4]);
insert.Parameters.AddWithValue("@merchantOrderItemId",lines[5]);
insert.Parameters.AddWithValue("@purchaseDate",lines[6]);
insert.Parameters.AddWithValue("@paymentsDate",lines[7]);
insert.ExecuteNonQuery();
Assuming that this is the correct way to use parametrized statements, it is still giving an error
"Incorrect string value: '\xEF\xBF\xBD' for column 'buyerName' at row 1"
Any other ideas?
\xEF\xBF\xBD is the UTF-8 encoding for the unicode character U+FFFD. This is a special character, also known as the "Replacement character". A quote from the wikipedia page about the special unicode characters:
The replacement character � (often a black diamond with a white question mark) is a symbol found in the Unicode standard at codepoint U+FFFD in the Specials table. It is used to indicate problems when a system is not able to decode a stream of data to a correct symbol. It is most commonly seen when a font does not contain a character, but is also seen when the data is invalid and does not match any character:
So it looks like your data source contains corrupted data. It is also possible that you try to read the data using the wrong encoding. Where do the lines come from?
If you can't fix the data, and your input indeed contains invalid characters, you could just remove the replacement characters:
// Strip U+FFFD (the replacement character). "\uFFFD" always reads exactly four
// hex digits, unlike the variable-length "\x" escape.
lines[n] = lines[n].Replace("\uFFFD", "");
Mattmanser is right, never write a sql query by concatenating the parameters directly in the query. An example of parametrized query is:
string lastname = "Doe";
double height = 6.1;
DateTime date = new DateTime(1978,4,18);
// `using` disposes the connection, command and reader (closing the
// connection) even when an exception is thrown.
using (var connection = new MySqlConnection(connStr))
{
    connection.Open();
    // Values are bound through '@' parameters and never concatenated into the SQL text.
    using (var command = new MySqlCommand(
        "SELECT * FROM tblPerson WHERE LastName = @Name AND Height > @Height AND BirthDate < @BirthDate", connection))
    {
        command.Parameters.AddWithValue("@Name", lastname);
        command.Parameters.AddWithValue("@Height", height);
        // Each marker gets its own parameter: the birth date binds to @BirthDate.
        command.Parameters.AddWithValue("@BirthDate", date);
        using (MySqlDataReader reader = command.ExecuteReader())
        {
            ...
        }
    }
}
To those who have a similar problem using PHP, try the function utf8_encode($string). It just works!
I had the same problem when my website encoding was UTF-8 and I tried to send a CP-1250 string in a form (for example, names returned by listdir for directories).
I think you must send the string in the same encoding as the website.
I am inserting French language text into nvarchar column in SQL server 2008. The French accented characters are not stored properly in the SQL DB.
// Sample text containing accented French characters.
string strData = "Accented chars- Les caractères accentués français ";

// Stage one row in an in-memory table with two string columns, ID and Value.
var dtTemp = new DataTable();
dtTemp.Columns.Add("ID", typeof(string));
dtTemp.Columns.Add("Value", typeof(string));
dtTemp.Rows.Add("100", strData);

strSQLCon = GetSQLConnectionString();
using (var cn = new SqlConnection(strSQLCon))
{
    cn.Open();
    using (var copy = new SqlBulkCopy(cn))
    {
        // Source and destination columns share the same names.
        copy.ColumnMappings.Add("ID", "ID");
        copy.ColumnMappings.Add("Value", "Value");
        copy.DestinationTableName = "MYTABLE";
        copy.WriteToServer(dtTemp);
    }
}
The French characters are not stored properly in SQL server data base.
It works fine when i do a normal insert query. insert into MYTABLEvalues(1 , 'Accented chars- Les caractères accentués français')
Please let me know why it does not work with SQL Bulk copy class. Any settings need to be changed or C# code needs to be modified to store the non-English characters properly.
I am designing this table, the collation for every column is set to French_CI_AS, French culture, accent sensitive. Every sql string type considered.
I am building a typed dataset for this table (not the purpose of this question).
Sql bulk copy:
var ds = new FrenchCharacters.FrenchDataSet();
using (var destinationConnection = new SqlConnection(StackOverflow.Properties.Settings.Default.StackOverflowConnectionString))
{
    destinationConnection.Open();
    //all French characters http://en.wikipedia.org/wiki/French_orthography
    string[] sArray = new string[] {
        "Àà, Ââ, Ææ, Ää"
        , "Çç"
        , "Îî, Ïï"
        , "Ôô, Œœ, Öö"
        , "Ùù, Ûû, Üü"
        , "Ÿÿ"
    };
    // Bulk copy over the connection opened above. Passing the connection
    // string instead would open a second connection, and after Open() the
    // ConnectionString property no longer contains the password unless
    // "Persist Security Info" is enabled.
    using (SqlBulkCopy bulkCopy = new SqlBulkCopy(destinationConnection))
    {
        bulkCopy.BatchSize = 500;
        bulkCopy.NotifyAfter = 10000;
        bulkCopy.DestinationTableName = "French";
        //
        // build data table to be written to the server
        // data table is now strongly-typed ds.French
        //
        // 100 passes over the samples; each row stores the same text in all six columns
        for (int i = 0; i < 100; i++)
        {
            foreach (string s in sArray)
                ds.French.AddFrenchRow(s, s, s, s, s, s);
        }
        //
        bulkCopy.WriteToServer(ds.French);
    }
}
result:
Notice no invalid entries whatever the sql char type!.
I tested your code on the following table and it worked fine, at least on SQL Server 2012.
-- Test table: a varchar ID plus an nvarchar Value column for the accented text.
CREATE TABLE [dbo].[MYTABLE](
[ID] [varchar](20) NOT NULL,
[Value] [nvarchar](255) NOT NULL)
thanks for your replies. The issue seems to occur while reading the csv file into data table before bulk insert. I included the encoding parameter while reading the csv file. (Encoding.Default) and it loads the french text properly and it gets stored in SQL DB without any issues.
old code:
// No encoding argument is passed here.
List<string> lstData = File.ReadAllLines(stFile).ToList();
Working code:
// Read the CSV with an explicit encoding (Encoding.Default) so the accented characters load correctly.
List<string> lstData = File.ReadAllLines(stFile, Encoding.Default).ToList();
thanks
Ashok
I am using a Blowfish encryption class:
/**
 * Thin wrapper around mcrypt's Blowfish cipher in CBC mode, holding the key
 * and IV that are used for every call.
 */
class Blowfish {
    /** Key passed to mcrypt on every call. */
    private $key;
    /** Initialisation vector passed to mcrypt on every call. */
    private $iv;

    /**
     * @param mixed $key key used for encryption and decryption
     * @param mixed $iv  initialisation vector for CBC mode
     */
    public function __construct($key, $iv) {
        $this->iv = $iv;
        $this->key = $key;
    }

    /** Returns the raw result of encrypting $data with Blowfish/CBC. */
    public function encrypt($data) {
        $cipherText = mcrypt_encrypt(MCRYPT_BLOWFISH, $this->key, $data, MCRYPT_MODE_CBC, $this->iv);
        return $cipherText;
    }

    /** Returns the result of decrypting $data with Blowfish/CBC. */
    public function decrypt($data) {
        $plainText = mcrypt_decrypt(MCRYPT_BLOWFISH, $this->key, $data, MCRYPT_MODE_CBC, $this->iv);
        return $plainText;
    }
}
Now when I encrypt something with this class, like:
// Create the cipher with a hard-coded key and IV (PHP class names are
// case-insensitive, so `blowfish` resolves to the Blowfish class).
$blowfish = new blowfish('DfRgBWE4Y4T7UgTWEdFP1Y', '85440934');
// Encrypt '12345', keep the result in $pass and echo it.
echo $pass = $blowfish -> encrypt('12345');
which should display --> ' ìI”‹YR” '
but I can't save it in database table. My table structure is like:
`Password` varchar(255) CHARACTER SET utf32 NOT NULL
But every time I try to insert it, the insertion is like " ?I??YR? "
what to do now?
I suggest to not store (potentially) binary, i.e. non-AlphaNumeric data, in an SQL database, especially not inside varchar(..) which will mess things up when you try to look at the the data.
Instead, either Base64 or HEX encode it, HEX is probably the better option, for HEX it is very easy and native for MySQL:
INSERT INTO table SET Password = HEX(?)
The ? is a placeholder for your Blowfish result
To pick up the binary data from the database do:
-- Reading data back is a SELECT; UNHEX() reverses the HEX() applied on insert.
SELECT UNHEX(Password) FROM table
Is there a way to specify certain part of a html file as another encoding?
The default encoding for the (generated) html is utf-8. However, some of the included data to be inserted in the html is in another encoding. It's something like:
<div>
the normal html in utf-8
</div>
<div>
<%= raw_data_in_another_encoding %>
</div>
Is there a way to hint a browser to render the 2nd <div> in another encoding? thanks
No, the entire file must have a single encoding. If you're saving a plain .html file, you'll have to convert the entire file to one encoding.
If you're using a server-side scripting language, however, you can always convert text from one encoding to another. You might designate UTF-8 as the encoding for the page, and then when you encounter bits of content currently encoded in, say, latin1, you can simply convert it to UTF-8 before outputting it.
How you do that, of course, would depend on the particular server-side language you're using.
In PHP, you could do:
echo iconv('ISO-8859-1', 'UTF-8', $someLatin1Text);
You can send any arbitrary encoding at any point in your HTTP response stream, but generally your client won't be able to deal with it. In HTML, multiple encodings in the same document simply aren't permitted. Or even gracefully handled by any modern client except perhaps by accident.
If you are using Ruby (guessing based only on your naming conventions), you can convert a string from one encoding to another using the iconv library. If you're using something else, there's most likely a similar alternative. PHP and Python both offer some encoding translation options based on the iconv library. In the .NET Framework, you can use the Encoding class to grab the suitable source encoding, and call GetString with your source byte array as the parameter to get a string suitable for further manipulation.
Numerical character references are another option, if you are primarily using another encoding and only occasionally using characters outside of that encoding's supported range. However, you're generally going to stay saner by converting to and from UTF-8 from legacy encodings.
I think you can't, but if you need some text to be shown in a different encoding, you can write a "translating function". I had a similar problem with an English page where I had to add some Spanish messages, so I did something like this:
/**
 * Replaces each accented Spanish letter written literally in this source
 * file with the same letter spelled as a \u escape, so the output does not
 * depend on how the file's own bytes were decoded.
 *
 * @param {string} string - text to translate.
 * @returns {string} the text after all twelve replacements.
 */
function spanishEncoding (string) {
  // [pattern as typed in the source, replacement as an escape], applied in order.
  const replacements = [
    [/á/g, "\u00e1"],
    [/Á/g, "\u00c1"],
    [/é/g, "\u00e9"],
    [/É/g, "\u00c9"],
    [/í/g, "\u00ed"],
    [/Í/g, "\u00cd"],
    [/ó/g, "\u00f3"],
    [/Ó/g, "\u00d3"],
    [/ú/g, "\u00fa"],
    [/Ú/g, "\u00da"],
    [/ñ/g, "\u00f1"],
    [/Ñ/g, "\u00d1"],
  ];
  return replacements.reduce(
    (text, [pattern, escaped]) => text.replace(pattern, escaped),
    string,
  );
}
// Text node created from the literal as-is.
var newDiv = window.content.document.createElement("div");
newDiv.appendChild(window.content.document.createTextNode("Esta página")); // The author reports this shows garbled text ("Esta p*Â!*gina")
// Same text passed through spanishEncoding() first.
var anotherDiv = window.content.document.createElement("div");
anotherDiv.appendChild(window.content.document.createTextNode(spanishEncoding("Esta página"))); // The author reports this shows "Esta página"
Hope it help you!