Samsung 1242 format uses 16 bits of UTF-16 as payload inside 3-byte UTF-8 sequences.

Characters showed up as Chinese when the raw data was interpreted directly as UTF16 (both little and big endian)
This commit is contained in:
Horst Beham
2020-07-13 10:00:41 +02:00
parent 180ad35d8e
commit a10fb8b353
5 changed files with 155 additions and 36 deletions

View File

@@ -75,6 +75,7 @@
<Compile Include="DbSerializer.cs" />
<Compile Include="DbSerializerPlugin.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Utf16InsideUtf8EnvelopeEncoding.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\ChanSort.Api\ChanSort.Api.csproj">

View File

@@ -4,13 +4,14 @@ using System.Data;
using System.Data.SQLite;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using ChanSort.Api;
namespace ChanSort.Loader.SamsungJ
{
/// <summary>
/// Loader for Samsung J/K/M/N/R/Q series .zip files (2015 - 2019+)
/// Loader for Samsung J/K/M/N/R/Q series .zip files (2015 - 2020)
/// </summary>
class DbSerializer : SerializerBase
{
@@ -218,11 +219,15 @@ namespace ChanSort.Loader.SamsungJ
try
{
cmd.CommandText = "select provId, cast(provName as blob) from PROV";
var prevEncoding = this.encoding;
this.encoding = Encoding.BigEndianUnicode; // while Sat and Service names might be utf16 binary data inside an utf8 envelope, the providers are always plain utf16
using (var r = cmd.ExecuteReader())
{
while (r.Read())
dict.Add(r.GetInt64(0), ReadUtf16(r, 1));
}
this.encoding = prevEncoding;
}
catch
{
@@ -392,7 +397,7 @@ namespace ChanSort.Loader.SamsungJ
return null;
byte[] nameBytes = new byte[200];
int nameLen = (int)r.GetBytes(fieldIndex, 0, nameBytes, 0, nameBytes.Length);
this.encoding ??= AutoDetectUtf16Endian(nameBytes, nameLen);
this.encoding ??= AutoDetectUtf16Encoding(nameBytes, nameLen);
if (this.encoding == null)
return string.Empty;
@@ -401,24 +406,35 @@ namespace ChanSort.Loader.SamsungJ
#endregion
#region AutoDetectUtf16Endian()
private Encoding AutoDetectUtf16Endian(byte[] nameBytes, int nameLen)
private Encoding AutoDetectUtf16Encoding(byte[] nameBytes, int nameLen)
{
if (this.DefaultEncoding is UnicodeEncoding)
return this.DefaultEncoding;
int evenBytesZero = 0;
int oddBytesZero = 0;
int bytesAbove128 = 0;
for (int i = 0; i < nameLen; i += 2)
{
if (nameBytes[i] == 0)
++evenBytesZero;
if (nameBytes[i] >= 128)
++bytesAbove128;
if (nameBytes[i + 1] == 0)
++oddBytesZero;
if (nameBytes[i + 1] >= 128)
++bytesAbove128;
}
if (evenBytesZero + oddBytesZero == nameLen)
return null;
if (bytesAbove128 + 1 >= nameLen)
{
//this.Features.ChannelNameEdit = ChannelNameEditMode.None; // unclear if the encoder produces byte sequences that the TV can decode again
return new Utf16InsideUtf8EnvelopeEncoding();
}
return evenBytesZero >= oddBytesZero ? Encoding.BigEndianUnicode : Encoding.Unicode;
}
@@ -482,45 +498,44 @@ namespace ChanSort.Loader.SamsungJ
#region SaveChannelList()
private void SaveChannelList(ChannelList channelList, string dbPath)
{
using (var conn = new SQLiteConnection("Data Source=" + dbPath))
using var conn = new SQLiteConnection("Data Source=" + dbPath);
conn.Open();
using var cmdUpdateSrv = PrepareUpdateCommand(conn);
using var cmdDeleteSrv = PrepareDeleteCommand(conn, (channelList.SignalSource & SignalSource.Digital) != 0);
using var cmdInsertFav = PrepareInsertFavCommand(conn);
using var cmdUpdateFav = PrepareUpdateFavCommand(conn);
using var cmdDeleteFav = PrepareDeleteFavCommand(conn);
using (var trans = conn.BeginTransaction())
{
conn.Open();
using (var cmdUpdateSrv = PrepareUpdateCommand(conn))
using (var cmdDeleteSrv = PrepareDeleteCommand(conn, (channelList.SignalSource & SignalSource.Digital) != 0))
using (var cmdInsertFav = PrepareInsertFavCommand(conn))
using (var cmdUpdateFav = PrepareUpdateFavCommand(conn))
using (var cmdDeleteFav = PrepareDeleteFavCommand(conn))
{
using (var trans = conn.BeginTransaction())
{
Editor.SequentializeFavPos(channelList, 5);
this.WriteChannels(cmdUpdateSrv, cmdDeleteSrv, cmdInsertFav, cmdUpdateFav, cmdDeleteFav, channelList);
trans.Commit();
}
this.RepairCorruptedDatabaseImage(cmdUpdateSrv);
}
Editor.SequentializeFavPos(channelList, 5);
this.WriteChannels(cmdUpdateSrv, cmdDeleteSrv, cmdInsertFav, cmdUpdateFav, cmdDeleteFav, channelList);
trans.Commit();
}
this.RepairCorruptedDatabaseImage(cmdUpdateSrv);
}
#endregion
#region Prepare*Command()
private static SQLiteCommand PrepareUpdateCommand(SQLiteConnection conn)
private SQLiteCommand PrepareUpdateCommand(SQLiteConnection conn)
{
var canUpdateNames = this.Features.ChannelNameEdit != ChannelNameEditMode.None;
var cmd = conn.CreateCommand();
cmd.CommandText = "update SRV set major=@nr, lockMode=@lock, hideGuide=@hidden, hidden=@hidden, numSel=@numsel, srvName=cast(@srvname as varchar) where srvId=@id";
var updateSrvName = canUpdateNames ? ", srvName=cast(@srvname as varchar)" : "";
cmd.CommandText = "update SRV set major=@nr, lockMode=@lock, hideGuide=@hidden, hidden=@hidden, numSel=@numsel" + updateSrvName + " where srvId=@id";
cmd.Parameters.Add(new SQLiteParameter("@id", DbType.Int64));
cmd.Parameters.Add(new SQLiteParameter("@nr", DbType.Int32));
cmd.Parameters.Add(new SQLiteParameter("@lock", DbType.Boolean));
cmd.Parameters.Add(new SQLiteParameter("@hidden", DbType.Boolean));
cmd.Parameters.Add(new SQLiteParameter("@numsel", DbType.Boolean));
cmd.Parameters.Add(new SQLiteParameter("@srvname", DbType.Binary));
if (canUpdateNames)
cmd.Parameters.Add(new SQLiteParameter("@srvname", DbType.Binary));
cmd.Prepare();
return cmd;
}
private static SQLiteCommand PrepareDeleteCommand(SQLiteConnection conn, bool digital)
private SQLiteCommand PrepareDeleteCommand(SQLiteConnection conn, bool digital)
{
var cmd = conn.CreateCommand();
var sql = new StringBuilder();
@@ -536,7 +551,7 @@ namespace ChanSort.Loader.SamsungJ
return cmd;
}
private static SQLiteCommand PrepareInsertFavCommand(SQLiteConnection conn)
private SQLiteCommand PrepareInsertFavCommand(SQLiteConnection conn)
{
var cmd = conn.CreateCommand();
cmd.CommandText = "insert into SRV_FAV (srvId, fav, pos) values (@id, @fav, @pos)";
@@ -547,7 +562,7 @@ namespace ChanSort.Loader.SamsungJ
return cmd;
}
private static SQLiteCommand PrepareUpdateFavCommand(SQLiteConnection conn)
private SQLiteCommand PrepareUpdateFavCommand(SQLiteConnection conn)
{
var cmd = conn.CreateCommand();
cmd.CommandText = "update SRV_FAV set pos=@pos where srvId=@id and fav=@fav";
@@ -557,7 +572,7 @@ namespace ChanSort.Loader.SamsungJ
cmd.Prepare();
return cmd;
}
private static SQLiteCommand PrepareDeleteFavCommand(SQLiteConnection conn)
private SQLiteCommand PrepareDeleteFavCommand(SQLiteConnection conn)
{
var cmd = conn.CreateCommand();
cmd.CommandText = "delete from SRV_FAV where srvId=@id and fav=@fav";
@@ -573,7 +588,7 @@ namespace ChanSort.Loader.SamsungJ
private void WriteChannels(SQLiteCommand cmdUpdateSrv, SQLiteCommand cmdDeleteSrv, SQLiteCommand cmdInsertFav, SQLiteCommand cmdUpdateFav, SQLiteCommand cmdDeleteFav,
ChannelList channelList, bool analog = false)
{
bool canUpdateNames = this.Features.ChannelNameEdit != ChannelNameEditMode.None;
foreach (ChannelInfo channelInfo in channelList.Channels.ToList())
{
var channel = channelInfo as DbChannel;
@@ -595,7 +610,8 @@ namespace ChanSort.Loader.SamsungJ
cmdUpdateSrv.Parameters["@lock"].Value = channel.Lock;
cmdUpdateSrv.Parameters["@hidden"].Value = channel.Hidden;
cmdUpdateSrv.Parameters["@numsel"].Value = !channel.Skip;
cmdUpdateSrv.Parameters["@srvname"].Value = channel.Name == null ? (object)DBNull.Value : encoding.GetBytes(channel.Name);
if (canUpdateNames)
cmdUpdateSrv.Parameters["@srvname"].Value = channel.Name == null ? (object)DBNull.Value : encoding.GetBytes(channel.Name);
cmdUpdateSrv.ExecuteNonQuery();
// update favorites

View File

@@ -0,0 +1,101 @@
using System.IO;
using System.Text;
namespace ChanSort.Loader.SamsungJ
{
// Samsung 1242 format does not store UTF16 characters directly, but instead wraps 16 data bits inside a UTF-8 lead + continuation byte sequence.
// A 3 byte UTF-8 sequence is used to encode 16 bits of utf-16 big endian input: 1110aaaa 10bbbbcc 10ccdddd represents the 16bit big endian integer ccccddddaaaabbbb, i.e. 0xE4, 0x84, 0x80 => 0x00, 0x41 => "A" in UTF-16 BE
// The Samsung encoder seems to create some illegal UTF-8 sequences at the end of the string as a result of padding and operating on 32bit inputs (2 characters) with big-endianness, which
// this decoder has to take care of. 0xFFFD can appear both in the raw input bytes (0xFF, 0xFD) as well as already encoded into UTF-8 wrappings (0xEF,0xBF,0xBD)
// This implementation here decodes the UTF-8 byte sequence into UTF-16 Little Endian for the sake of simplicity: aaaa=4, bbbb=1, cccc=0, dddd=0 => 0xE4, 0x84, 0x80 => 0x41, 0x00 => "A" in UTF-16 LE.
// The encoder here operates on 16bit characters and not 32bit 2-characters, so there is no need for padding and no invalid UTF-8 sequences.
/// <summary>
/// Encoding that carries each 16-bit UTF-16 code unit as payload inside a 3-byte UTF-8 style
/// envelope: 1110aaaa 10bbbbcc 10ccdddd, where aaaabbbb is the low byte and ccccdddd the high
/// byte of the code unit (e.g. 0xE4, 0x84, 0x80 decodes to "A").
/// </summary>
/// <remarks>
/// The decoder additionally tolerates 2-byte envelopes, plain bytes below 0x80, invalid lead
/// bytes and a truncated envelope at the end of the input.
/// <see cref="GetCharCount(byte[], int, int)"/> and
/// <see cref="GetChars(byte[], int, int, char[], int)"/> share one decoder, so the reported
/// count always equals the number of characters actually produced. As a result, strings
/// returned by GetString carry no trailing '\0' padding.
/// </remarks>
public class Utf16InsideUtf8EnvelopeEncoding : Encoding
{
    /// <summary>Returns the upper bound of bytes produced for <paramref name="charCount"/> chars (3 per code unit).</summary>
    public override int GetMaxByteCount(int charCount)
    {
        return charCount * 3;
    }

    /// <summary>Returns the exact number of bytes produced: one 3-byte envelope per UTF-16 code unit.</summary>
    public override int GetByteCount(char[] chars, int index, int count)
    {
        return count * 3;
    }

    /// <summary>
    /// Encodes <paramref name="charCount"/> chars starting at <paramref name="charIndex"/> into
    /// <paramref name="bytes"/> starting at <paramref name="byteIndex"/>.
    /// </summary>
    /// <returns>The number of bytes written.</returns>
    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
    {
        // Going through UTF-16 LE gives us the (low, high) byte pair of every code unit.
        var utf16Le = Unicode.GetBytes(chars, charIndex, charCount);
        int o = byteIndex;
        for (int i = 0; i + 1 < utf16Le.Length; i += 2, o += 3)
        {
            var low = utf16Le[i + 0];
            var high = utf16Le[i + 1];
            bytes[o + 0] = (byte) (0xE0 + (low >> 4));
            bytes[o + 1] = (byte) (0x80 + ((low & 0x0F) << 2) + (high >> 6));
            bytes[o + 2] = (byte) (0x80 + (high & 0x3F));
        }

        return o - byteIndex;
    }

    /// <summary>
    /// Returns the upper bound of chars produced for <paramref name="byteCount"/> bytes.
    /// A single byte below 0x80 already decodes to one char, so the bound is one char per byte.
    /// </summary>
    public override int GetMaxCharCount(int byteCount)
    {
        return byteCount;
    }

    /// <summary>
    /// Returns the exact number of chars that GetChars produces for the given byte range.
    /// </summary>
    public override int GetCharCount(byte[] bytes, int index, int count)
    {
        // The decoder always emits complete 2-byte UTF-16 LE code units.
        return DecodeToUtf16Le(bytes, index, count).Length / 2;
    }

    /// <summary>
    /// Decodes <paramref name="byteCount"/> bytes starting at <paramref name="byteIndex"/> into
    /// <paramref name="chars"/> starting at <paramref name="charIndex"/>.
    /// </summary>
    /// <returns>The number of chars written.</returns>
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
        var utf16Le = DecodeToUtf16Le(bytes, byteIndex, byteCount);
        return Encoding.Unicode.GetChars(utf16Le, 0, utf16Le.Length, chars, charIndex);
    }

    /// <summary>
    /// Unwraps the envelope bytes in [byteIndex, byteIndex + byteCount) into UTF-16 LE bytes.
    /// Continuation bytes that would lie outside that range are treated as 0 instead of being read.
    /// </summary>
    private static byte[] DecodeToUtf16Le(byte[] bytes, int byteIndex, int byteCount)
    {
        using MemoryStream ms = new MemoryStream(40);
        int end = byteIndex + byteCount;
        for (int i = byteIndex; i < end; i++)
        {
            int b0 = bytes[i];
            if (b0 == 0 && i == end - 1) // satellite names end with a single trailing 0x00 byte
                break;
            if (b0 > 0xF7) // invalid UTF-8 lead byte. (0xFF, 0xFD) = 0xFFFD in BigEndian can appear unencoded at the end of the byte stream, likely as a padding
                continue;

            if (b0 >= 0xE0) // 3-byte envelope for 2 payload bytes
            {
                int b1 = i + 1 < end ? bytes[i + 1] : 0;
                int b2 = i + 2 < end ? bytes[i + 2] : 0;
                if ((b2 & 0xC0) != 0x80) // invalid 2nd continuation byte; only a single byte is encoded as 1110aaaa 10bbbbcc => aaaabbbb
                {
                    b2 = 0;
                    --i; // the third byte was not part of this envelope, so do not consume it
                }

                int ch1 = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
                int ch2 = ((b1 & 0x03) << 6) | (b2 & 0x3F);
                // (0xFF, 0xFD) is what a UTF-8 encoded replacement character (0xEF, 0xBF, 0xBD) unwraps to; drop it
                if (ch1 != 0xFF || ch2 != 0xFD)
                {
                    ms.WriteByte((byte) ch1);
                    ms.WriteByte((byte) ch2);
                }

                i += 2;
            }
            else if (b0 >= 0xC0) // 2-byte envelope for 1 payload byte as 110xaaaa 10bbbbcc => aaaabbbb
            {
                int b1 = i + 1 < end ? bytes[i + 1] : 0;
                int ch = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
                ms.WriteByte((byte) ch);
                ms.WriteByte(0);
                i++;
            }
            else if (b0 < 0x80) // plain single byte below 0x80
            {
                ms.WriteByte((byte) b0);
                ms.WriteByte(0);
            }
            // 0x80..0xBF: stray continuation byte without a lead byte, skipped
        }

        return ms.ToArray();
    }
}
}

View File

@@ -1,6 +1,4 @@
using System;
using System.Net;
using System.Net.Security;
using System.Net;
using System.Threading;
using ChanSort.Ui.Properties;
using DevExpress.XtraEditors;
@@ -40,11 +38,9 @@ namespace ChanSort.Ui
//Change SSL checks so that all checks pass
//ServicePointManager.ServerCertificateValidationCallback = delegate { return true; };
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
using (WebClient client = new WebClient())
{
client.Proxy = null; // prevent a 1min wait/timeout by a .NET bug
response = client.DownloadString(UpdateUrl);
}
using WebClient client = new WebClient();
client.Proxy = null; // prevent a 1min wait/timeout by a .NET bug
response = client.DownloadString(UpdateUrl);
}
finally
{

View File

@@ -1,6 +1,11 @@
ChanSort Change Log
===================
2020-07-13
- Samsung 1242 format: channel names were displayed as Chinese characters instead of Latin letters
(Names are not stored as plain characters in this format; instead, each 16-bit UTF-16 code unit is encoded as "payload"
inside a 3-byte UTF-8 sequence)
2020-07-12
- added UTF-16 Big Endian and Little Endian options to character set menu
- Samsung .zip loader: auto-detect UTF-16 endianness and allow to change encoding after loading to UTF-16 LE/BE