mirror of
https://github.com/PredatH0r/ChanSort.git
synced 2026-05-07 18:15:34 +02:00
Samsung 1242 format uses 16 bits of UTF-16 as payload inside 3-byte UTF-8 sequences.
Characters showed up as Chinese when the raw data was interpreted directly as UTF-16 (both little and big endian).
This commit is contained in:
@@ -75,6 +75,7 @@
|
||||
<Compile Include="DbSerializer.cs" />
|
||||
<Compile Include="DbSerializerPlugin.cs" />
|
||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
<Compile Include="Utf16InsideUtf8EnvelopeEncoding.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\ChanSort.Api\ChanSort.Api.csproj">
|
||||
|
||||
@@ -4,13 +4,14 @@ using System.Data;
|
||||
using System.Data.SQLite;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Runtime.CompilerServices;
|
||||
using System.Text;
|
||||
using ChanSort.Api;
|
||||
|
||||
namespace ChanSort.Loader.SamsungJ
|
||||
{
|
||||
/// <summary>
|
||||
/// Loader for Samsung J/K/M/N/R/Q series .zip files (2015 - 2019+)
|
||||
/// Loader for Samsung J/K/M/N/R/Q series .zip files (2015 - 2020)
|
||||
/// </summary>
|
||||
class DbSerializer : SerializerBase
|
||||
{
|
||||
@@ -218,11 +219,15 @@ namespace ChanSort.Loader.SamsungJ
|
||||
try
|
||||
{
|
||||
cmd.CommandText = "select provId, cast(provName as blob) from PROV";
|
||||
var prevEncoding = this.encoding;
|
||||
this.encoding = Encoding.BigEndianUnicode; // while Sat and Service names might be utf16 binary data inside an utf8 envelope, the providers are always plain utf16
|
||||
using (var r = cmd.ExecuteReader())
|
||||
{
|
||||
while (r.Read())
|
||||
dict.Add(r.GetInt64(0), ReadUtf16(r, 1));
|
||||
}
|
||||
|
||||
this.encoding = prevEncoding;
|
||||
}
|
||||
catch
|
||||
{
|
||||
@@ -392,7 +397,7 @@ namespace ChanSort.Loader.SamsungJ
|
||||
return null;
|
||||
byte[] nameBytes = new byte[200];
|
||||
int nameLen = (int)r.GetBytes(fieldIndex, 0, nameBytes, 0, nameBytes.Length);
|
||||
this.encoding ??= AutoDetectUtf16Endian(nameBytes, nameLen);
|
||||
this.encoding ??= AutoDetectUtf16Encoding(nameBytes, nameLen);
|
||||
if (this.encoding == null)
|
||||
return string.Empty;
|
||||
|
||||
@@ -401,24 +406,35 @@ namespace ChanSort.Loader.SamsungJ
|
||||
#endregion
|
||||
|
||||
#region AutoDetectUtf16Endian()
|
||||
private Encoding AutoDetectUtf16Endian(byte[] nameBytes, int nameLen)
|
||||
private Encoding AutoDetectUtf16Encoding(byte[] nameBytes, int nameLen)
|
||||
{
|
||||
if (this.DefaultEncoding is UnicodeEncoding)
|
||||
return this.DefaultEncoding;
|
||||
|
||||
int evenBytesZero = 0;
|
||||
int oddBytesZero = 0;
|
||||
int bytesAbove128 = 0;
|
||||
for (int i = 0; i < nameLen; i += 2)
|
||||
{
|
||||
if (nameBytes[i] == 0)
|
||||
++evenBytesZero;
|
||||
if (nameBytes[i] >= 128)
|
||||
++bytesAbove128;
|
||||
if (nameBytes[i + 1] == 0)
|
||||
++oddBytesZero;
|
||||
if (nameBytes[i + 1] >= 128)
|
||||
++bytesAbove128;
|
||||
}
|
||||
|
||||
if (evenBytesZero + oddBytesZero == nameLen)
|
||||
return null;
|
||||
|
||||
if (bytesAbove128 + 1 >= nameLen)
|
||||
{
|
||||
//this.Features.ChannelNameEdit = ChannelNameEditMode.None; // unclear if the encoder produces byte sequences that the TV can decode again
|
||||
return new Utf16InsideUtf8EnvelopeEncoding();
|
||||
}
|
||||
|
||||
return evenBytesZero >= oddBytesZero ? Encoding.BigEndianUnicode : Encoding.Unicode;
|
||||
}
|
||||
|
||||
@@ -482,45 +498,44 @@ namespace ChanSort.Loader.SamsungJ
|
||||
#region SaveChannelList()
|
||||
private void SaveChannelList(ChannelList channelList, string dbPath)
|
||||
{
|
||||
using (var conn = new SQLiteConnection("Data Source=" + dbPath))
|
||||
using var conn = new SQLiteConnection("Data Source=" + dbPath);
|
||||
conn.Open();
|
||||
using var cmdUpdateSrv = PrepareUpdateCommand(conn);
|
||||
using var cmdDeleteSrv = PrepareDeleteCommand(conn, (channelList.SignalSource & SignalSource.Digital) != 0);
|
||||
using var cmdInsertFav = PrepareInsertFavCommand(conn);
|
||||
using var cmdUpdateFav = PrepareUpdateFavCommand(conn);
|
||||
using var cmdDeleteFav = PrepareDeleteFavCommand(conn);
|
||||
using (var trans = conn.BeginTransaction())
|
||||
{
|
||||
conn.Open();
|
||||
using (var cmdUpdateSrv = PrepareUpdateCommand(conn))
|
||||
using (var cmdDeleteSrv = PrepareDeleteCommand(conn, (channelList.SignalSource & SignalSource.Digital) != 0))
|
||||
using (var cmdInsertFav = PrepareInsertFavCommand(conn))
|
||||
using (var cmdUpdateFav = PrepareUpdateFavCommand(conn))
|
||||
using (var cmdDeleteFav = PrepareDeleteFavCommand(conn))
|
||||
{
|
||||
using (var trans = conn.BeginTransaction())
|
||||
{
|
||||
Editor.SequentializeFavPos(channelList, 5);
|
||||
this.WriteChannels(cmdUpdateSrv, cmdDeleteSrv, cmdInsertFav, cmdUpdateFav, cmdDeleteFav, channelList);
|
||||
trans.Commit();
|
||||
}
|
||||
this.RepairCorruptedDatabaseImage(cmdUpdateSrv);
|
||||
}
|
||||
Editor.SequentializeFavPos(channelList, 5);
|
||||
this.WriteChannels(cmdUpdateSrv, cmdDeleteSrv, cmdInsertFav, cmdUpdateFav, cmdDeleteFav, channelList);
|
||||
trans.Commit();
|
||||
}
|
||||
this.RepairCorruptedDatabaseImage(cmdUpdateSrv);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Prepare*Command()
|
||||
|
||||
private static SQLiteCommand PrepareUpdateCommand(SQLiteConnection conn)
|
||||
private SQLiteCommand PrepareUpdateCommand(SQLiteConnection conn)
|
||||
{
|
||||
var canUpdateNames = this.Features.ChannelNameEdit != ChannelNameEditMode.None;
|
||||
var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "update SRV set major=@nr, lockMode=@lock, hideGuide=@hidden, hidden=@hidden, numSel=@numsel, srvName=cast(@srvname as varchar) where srvId=@id";
|
||||
var updateSrvName = canUpdateNames ? ", srvName=cast(@srvname as varchar)" : "";
|
||||
cmd.CommandText = "update SRV set major=@nr, lockMode=@lock, hideGuide=@hidden, hidden=@hidden, numSel=@numsel" + updateSrvName + " where srvId=@id";
|
||||
cmd.Parameters.Add(new SQLiteParameter("@id", DbType.Int64));
|
||||
cmd.Parameters.Add(new SQLiteParameter("@nr", DbType.Int32));
|
||||
cmd.Parameters.Add(new SQLiteParameter("@lock", DbType.Boolean));
|
||||
cmd.Parameters.Add(new SQLiteParameter("@hidden", DbType.Boolean));
|
||||
cmd.Parameters.Add(new SQLiteParameter("@numsel", DbType.Boolean));
|
||||
cmd.Parameters.Add(new SQLiteParameter("@srvname", DbType.Binary));
|
||||
if (canUpdateNames)
|
||||
cmd.Parameters.Add(new SQLiteParameter("@srvname", DbType.Binary));
|
||||
cmd.Prepare();
|
||||
return cmd;
|
||||
}
|
||||
|
||||
private static SQLiteCommand PrepareDeleteCommand(SQLiteConnection conn, bool digital)
|
||||
private SQLiteCommand PrepareDeleteCommand(SQLiteConnection conn, bool digital)
|
||||
{
|
||||
var cmd = conn.CreateCommand();
|
||||
var sql = new StringBuilder();
|
||||
@@ -536,7 +551,7 @@ namespace ChanSort.Loader.SamsungJ
|
||||
return cmd;
|
||||
}
|
||||
|
||||
private static SQLiteCommand PrepareInsertFavCommand(SQLiteConnection conn)
|
||||
private SQLiteCommand PrepareInsertFavCommand(SQLiteConnection conn)
|
||||
{
|
||||
var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "insert into SRV_FAV (srvId, fav, pos) values (@id, @fav, @pos)";
|
||||
@@ -547,7 +562,7 @@ namespace ChanSort.Loader.SamsungJ
|
||||
return cmd;
|
||||
}
|
||||
|
||||
private static SQLiteCommand PrepareUpdateFavCommand(SQLiteConnection conn)
|
||||
private SQLiteCommand PrepareUpdateFavCommand(SQLiteConnection conn)
|
||||
{
|
||||
var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "update SRV_FAV set pos=@pos where srvId=@id and fav=@fav";
|
||||
@@ -557,7 +572,7 @@ namespace ChanSort.Loader.SamsungJ
|
||||
cmd.Prepare();
|
||||
return cmd;
|
||||
}
|
||||
private static SQLiteCommand PrepareDeleteFavCommand(SQLiteConnection conn)
|
||||
private SQLiteCommand PrepareDeleteFavCommand(SQLiteConnection conn)
|
||||
{
|
||||
var cmd = conn.CreateCommand();
|
||||
cmd.CommandText = "delete from SRV_FAV where srvId=@id and fav=@fav";
|
||||
@@ -573,7 +588,7 @@ namespace ChanSort.Loader.SamsungJ
|
||||
private void WriteChannels(SQLiteCommand cmdUpdateSrv, SQLiteCommand cmdDeleteSrv, SQLiteCommand cmdInsertFav, SQLiteCommand cmdUpdateFav, SQLiteCommand cmdDeleteFav,
|
||||
ChannelList channelList, bool analog = false)
|
||||
{
|
||||
|
||||
bool canUpdateNames = this.Features.ChannelNameEdit != ChannelNameEditMode.None;
|
||||
foreach (ChannelInfo channelInfo in channelList.Channels.ToList())
|
||||
{
|
||||
var channel = channelInfo as DbChannel;
|
||||
@@ -595,7 +610,8 @@ namespace ChanSort.Loader.SamsungJ
|
||||
cmdUpdateSrv.Parameters["@lock"].Value = channel.Lock;
|
||||
cmdUpdateSrv.Parameters["@hidden"].Value = channel.Hidden;
|
||||
cmdUpdateSrv.Parameters["@numsel"].Value = !channel.Skip;
|
||||
cmdUpdateSrv.Parameters["@srvname"].Value = channel.Name == null ? (object)DBNull.Value : encoding.GetBytes(channel.Name);
|
||||
if (canUpdateNames)
|
||||
cmdUpdateSrv.Parameters["@srvname"].Value = channel.Name == null ? (object)DBNull.Value : encoding.GetBytes(channel.Name);
|
||||
cmdUpdateSrv.ExecuteNonQuery();
|
||||
|
||||
// update favorites
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
using System.IO;
|
||||
using System.Text;
|
||||
|
||||
namespace ChanSort.Loader.SamsungJ
|
||||
{
|
||||
// Samsung 1242 format does not store UTF16 characters directly, but instead wraps 16 data bits inside a UTF-8 lead + continuation byte sequence.
|
||||
// A 3 byte UTF-8 sequence is used to encode 16 bits of utf-16 big endian input: 1110aaaa 10bbbbcc 10ccdddd represents the 16bit big endian integer ccccddddaaaabbbb, i.e. 0xE4, 0x84, 0x80 => 0x00, 0x41 => "A" in UTF-16 BE
|
||||
// The Samsung encoder seems to create some illegal UTF-8 sequences at the end of the string as a result of padding and operating on 32bit inputs (2 characters) with big-endianness, which
|
||||
// this decoder has to take care of. 0xFFFD can appear both in the raw input bytes (0xFF, 0xFD) as well as already encoded into UTF-8 wrappings (0xEF,0xBF,0xBD)
|
||||
|
||||
// This implementation here decodes the UTF-8 byte sequence into UTF-16 Little Endian for the sake of simplicity: aaaa=4, bbbb=1, cccc=0, dddd=0 => 0xE4, 0x84, 0x80 => 0x41, 0x00 => "A" in UTF-16 LE.
|
||||
// The encoder here operates on 16bit characters and not 32bit 2-characters, so there is no need for padding and no invalid UTF-8 sequences.
|
||||
|
||||
/// <summary>
/// Encoding for Samsung channel lists that wrap UTF-16 code units inside UTF-8 style byte sequences.
/// A 3-byte sequence 1110aaaa 10bbbbcc 10ccdddd carries the low byte aaaabbbb and the high byte ccccdddd
/// of one UTF-16 code unit, e.g. 0xE4, 0x84, 0x80 decodes to "A".
/// The decoder additionally tolerates shorter sequences, stray padding bytes and replacement characters.
/// </summary>
public class Utf16InsideUtf8EnvelopeEncoding : Encoding
{
    /// <summary>Every char is always written as exactly one 3-byte envelope.</summary>
    public override int GetMaxByteCount(int charCount)
    {
        return charCount * 3;
    }

    /// <summary>Returns the exact number of bytes <see cref="GetBytes(char[],int,int,byte[],int)"/> writes for <paramref name="count"/> chars.</summary>
    public override int GetByteCount(char[] chars, int index, int count)
    {
        return count * 3;
    }

    /// <summary>
    /// Encodes each UTF-16 code unit into a 3-byte envelope.
    /// </summary>
    /// <returns>The number of bytes written, which is always <paramref name="charCount"/> * 3.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="bytes"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException"><paramref name="byteIndex"/> is outside of <paramref name="bytes"/>.</exception>
    /// <exception cref="System.ArgumentException"><paramref name="bytes"/> is too small for the encoded output.</exception>
    public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
    {
        if (bytes == null)
            throw new System.ArgumentNullException(nameof(bytes));
        if (byteIndex < 0 || byteIndex > bytes.Length)
            throw new System.ArgumentOutOfRangeException(nameof(byteIndex));

        // validates chars/charIndex/charCount and yields 2 bytes (low, high) per char
        var utf16Le = Encoding.Unicode.GetBytes(chars, charIndex, charCount);

        int required = utf16Le.Length / 2 * 3;
        if (bytes.Length - byteIndex < required)
            throw new System.ArgumentException("The output byte buffer is too small to contain the encoded data.", nameof(bytes));

        int o = byteIndex;
        for (int i = 0; i < utf16Le.Length; i += 2, o += 3)
        {
            var lo = utf16Le[i + 0];
            var hi = utf16Le[i + 1];
            bytes[o + 0] = (byte) (0xE0 + (lo >> 4));                        // 1110aaaa
            bytes[o + 1] = (byte) (0x80 + ((lo & 0x0F) << 2) + (hi >> 6));   // 10bbbbcc
            bytes[o + 2] = (byte) (0x80 + (hi & 0x3F));                      // 10ccdddd
        }

        return charCount * 3;
    }

    /// <summary>
    /// Upper bound for the number of decoded chars. Bytes below 0x80 decode to one char each,
    /// so the worst case is one char per input byte.
    /// </summary>
    public override int GetMaxCharCount(int byteCount)
    {
        return byteCount;
    }

    /// <summary>
    /// Returns the exact number of chars that <see cref="GetChars(byte[],int,int,char[],int)"/> produces for the same input.
    /// The value must be exact because the inherited GetChars/GetString overloads size their result buffer with it.
    /// </summary>
    public override int GetCharCount(byte[] bytes, int index, int count)
    {
        var utf16Le = DecodeToUtf16Le(bytes, index, count);
        return Encoding.Unicode.GetCharCount(utf16Le, 0, utf16Le.Length);
    }

    /// <summary>
    /// Decodes the enveloped bytes into chars.
    /// </summary>
    /// <returns>The number of chars written to <paramref name="chars"/>.</returns>
    public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex)
    {
        var utf16Le = DecodeToUtf16Le(bytes, byteIndex, byteCount);
        return Encoding.Unicode.GetChars(utf16Le, 0, utf16Le.Length, chars, charIndex);
    }

    /// <summary>
    /// Unwraps the envelope bytes in [index, index + count) into UTF-16 little endian bytes.
    /// Bytes that a sequence would need beyond the end of the range are treated as 0x00 instead of being read from the array.
    /// </summary>
    private static byte[] DecodeToUtf16Le(byte[] bytes, int index, int count)
    {
        if (bytes == null)
            throw new System.ArgumentNullException(nameof(bytes));
        if (index < 0)
            throw new System.ArgumentOutOfRangeException(nameof(index));
        if (count < 0 || count > bytes.Length - index)
            throw new System.ArgumentOutOfRangeException(nameof(count));

        int end = index + count;
        using MemoryStream ms = new MemoryStream(40);
        for (int i = index; i < end; i++)
        {
            int b0 = bytes[i];
            if (b0 == 0 && i == end - 1) // satellite names end with a single trailing 0x00 byte
                break;
            if (b0 > 0xF7) // invalid UTF-8 lead byte. (0xFF, 0xFD) = 0xFFFD in BigEndian can appear unencoded at the end of the byte stream, likely as a padding
                continue;

            if (b0 >= 0xE0) // 3-byte UTF envelope for 2 input bytes
            {
                int b1 = i + 1 < end ? bytes[i + 1] : 0;
                int b2 = i + 2 < end ? bytes[i + 2] : 0;
                if ((b2 & 0xC0) != 0x80) // invalid 2nd UTF-8 continuation byte; only a single byte is encoded as 1110aaaa 10bbbbcc => aaaabbbb
                {
                    b2 = 0;
                    --i; // the 3rd byte is not part of this sequence and must be looked at again
                }

                int ch1 = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
                int ch2 = ((b1 & 0x03) << 6) | (b2 & 0x3F);
                if (ch1 != 0xFF || ch2 != 0xFD) // drop the replacement character: its regular UTF-8 form 0xEF,0xBF,0xBD unwraps to exactly this pair
                {
                    ms.WriteByte((byte) ch1);
                    ms.WriteByte((byte) ch2);
                }

                i += 2;
            }
            else if (b0 >= 0xC0) // 2-byte UTF envelope for 1 input byte as 110xaaaa 10bbbbcc => aaaabbbb
            {
                int b1 = i + 1 < end ? bytes[i + 1] : 0;
                int ch = ((b0 & 0x0F) << 4) | ((b1 & 0x3C) >> 2);
                ms.WriteByte((byte) ch);
                ms.WriteByte(0);
                i++;
            }
            else if (b0 < 0x80) // 1-byte UTF envelope for 1 input byte < 0x80
            {
                ms.WriteByte((byte) b0);
                ms.WriteByte(0);
            }
            // 0x80..0xBF: continuation byte without a lead byte, ignored
        }

        return ms.ToArray();
    }
}
|
||||
}
|
||||
@@ -1,6 +1,4 @@
|
||||
using System;
|
||||
using System.Net;
|
||||
using System.Net.Security;
|
||||
using System.Net;
|
||||
using System.Threading;
|
||||
using ChanSort.Ui.Properties;
|
||||
using DevExpress.XtraEditors;
|
||||
@@ -40,11 +38,9 @@ namespace ChanSort.Ui
|
||||
//Change SSL checks so that all checks pass
|
||||
//ServicePointManager.ServerCertificateValidationCallback = delegate { return true; };
|
||||
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
|
||||
using (WebClient client = new WebClient())
|
||||
{
|
||||
client.Proxy = null; // prevent a 1min wait/timeout by a .NET bug
|
||||
response = client.DownloadString(UpdateUrl);
|
||||
}
|
||||
using WebClient client = new WebClient();
|
||||
client.Proxy = null; // prevent a 1min wait/timeout by a .NET bug
|
||||
response = client.DownloadString(UpdateUrl);
|
||||
}
|
||||
finally
|
||||
{
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
ChanSort Change Log
|
||||
===================
|
||||
|
||||
2020-07-13
|
||||
- Samsung 1242 format: channel names were displayed as Chinese characters instead of Latin letters
|
||||
(Names are not stored as characters in this format, but instead 16 bits of UTF16 code points are encoded as "payload"
|
||||
inside 3 byte UTF-8 sequences)
|
||||
|
||||
2020-07-12
|
||||
- added UTF-16 Big Endian and Little Endian options to character set menu
|
||||
- Samsung .zip loader: auto-detect UTF-16 endianness and allow changing the encoding to UTF-16 LE/BE after loading
|
||||
|
||||
Reference in New Issue
Block a user