feat: cache sharedstrings in sqlite

This commit is contained in:
罗威 2022-03-24 23:46:54 +08:00
parent bbe29ae1b3
commit 65ad0f7e4c
8 changed files with 280 additions and 34 deletions

175
src/MiniExcel/DbList.cs Normal file
View File

@ -0,0 +1,175 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data.SQLite;
using System.IO;
using System.Text;
namespace MiniExcelLibs
{
/// <summary>
/// An <see cref="IList{T}"/> of strings stored in a SQLite database file named
/// <c>{name}.db</c>, so that very large shared-string tables do not have to be kept
/// in managed memory. Each element is one row of the <c>sharedStrings</c> table:
/// <c>name</c> holds the value and <c>index</c> holds its zero-based list position.
/// </summary>
/// <remarks>
/// All values are passed to SQLite as bound parameters, so strings containing quotes or
/// other SQL syntax are stored verbatim. The instance shares one connection and one
/// reusable command; it is not written to be used from several threads at once.
/// Disposing the instance deletes the database file.
/// </remarks>
public class DbList : IList<string>, IDisposable
{
    private const string _tableName = "sharedStrings";

    private readonly string _name;
    private SQLiteConnection _conn;
    private SQLiteCommand _cmd;
    private bool _disposed;

    /// <summary>
    /// Opens (creating if necessary) the database file <c>{name}.db</c> and creates an
    /// empty backing table, discarding any table left over from a previous run.
    /// </summary>
    /// <param name="name">Path of the database file without the <c>.db</c> extension.</param>
    public DbList(string name)
    {
        _name = name;
        _conn = new SQLiteConnection($"Data Source={name}.db;Version=3;");
        try
        {
            _conn.Open();
            _cmd = _conn.CreateCommand();
            CreateTable();
        }
        catch
        {
            // Do not leak the native connection when initialisation fails.
            _cmd?.Dispose();
            _cmd = null;
            _conn.Dispose();
            _conn = null;
            throw;
        }
    }

    /// <summary>Drops any existing table and recreates it together with its two indexes.</summary>
    private void CreateTable()
    {
        Prepare($@"
DROP TABLE IF EXISTS {_tableName};
CREATE TABLE {_tableName} (name TEXT, `index` INTEGER);
CREATE UNIQUE INDEX idx_index
ON {_tableName} (
`index`
);
CREATE INDEX idx_name
ON {_tableName} (
name
);").ExecuteNonQuery();
    }

    /// <summary>
    /// Loads <paramref name="sql"/> and the given parameters into the shared command,
    /// replacing whatever the previous call left there, and returns that command.
    /// </summary>
    private SQLiteCommand Prepare(string sql, params SQLiteParameter[] parameters)
    {
        _cmd.CommandText = sql;
        _cmd.Parameters.Clear();
        foreach (var parameter in parameters)
            _cmd.Parameters.Add(parameter);
        return _cmd;
    }

    /// <summary>Builds the <c>@name</c> parameter; a null string is bound as SQL NULL.</summary>
    private static SQLiteParameter NameParam(string value)
    {
        return new SQLiteParameter("@name", (object)value ?? DBNull.Value);
    }

    /// <summary>Builds the <c>@index</c> parameter.</summary>
    private static SQLiteParameter IndexParam(long value)
    {
        return new SQLiteParameter("@index", value);
    }

    /// <summary>Streams all values in list order using a dedicated command and reader.</summary>
    public IEnumerator<string> GetEnumerator()
    {
        // A separate command is used so that calls made on the list while it is being
        // enumerated do not overwrite the text of the command that owns the reader.
        using (var cmd = _conn.CreateCommand())
        {
            cmd.CommandText = $"SELECT name FROM {_tableName} ORDER BY `index`";
            using (var reader = cmd.ExecuteReader())
            {
                while (reader.Read())
                    yield return reader.IsDBNull(0) ? null : reader.GetString(0);
            }
        }
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <summary>Appends <paramref name="item"/> at position <c>max(index) + 1</c>.</summary>
    public void Add(string item)
    {
        // Query the current maximum first: it reuses the shared command.
        var nextIndex = GetMaxIndex() + 1;
        Prepare($"INSERT INTO {_tableName}(name, `index`) VALUES (@name, @index)",
            NameParam(item), IndexParam(nextIndex)).ExecuteNonQuery();
    }

    /// <summary>Returns the highest stored index, or -1 when the table is empty.</summary>
    private long GetMaxIndex()
    {
        var result = Prepare($"SELECT MAX(`index`) FROM {_tableName}").ExecuteScalar();
        if (result == null || result == DBNull.Value)
            return -1;
        return Convert.ToInt64(result);
    }

    /// <summary>Removes every element. The table itself is kept so the list stays usable.</summary>
    public void Clear()
    {
        Prepare($"DELETE FROM {_tableName}").ExecuteNonQuery();
    }

    /// <summary>Returns true when at least one row stores exactly <paramref name="item"/>.</summary>
    public bool Contains(string item)
    {
        var hit = Prepare($"SELECT 1 FROM {_tableName} WHERE name = @name LIMIT 1",
            NameParam(item)).ExecuteScalar();
        return hit != null;
    }

    /// <summary>Copies all values, in list order, into <paramref name="array"/> starting at <paramref name="arrayIndex"/>.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="array"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="arrayIndex"/> is negative.</exception>
    /// <exception cref="ArgumentException">The destination is too small to hold all elements.</exception>
    public void CopyTo(string[] array, int arrayIndex)
    {
        if (array == null)
            throw new ArgumentNullException(nameof(array));
        if (arrayIndex < 0)
            throw new ArgumentOutOfRangeException(nameof(arrayIndex));
        if (array.Length - arrayIndex < Count)
            throw new ArgumentException("Destination array is not long enough.", nameof(array));

        foreach (var value in this)
            array[arrayIndex++] = value;
    }

    /// <summary>
    /// Appends all items of <paramref name="array"/> in order. The rows are written inside
    /// one transaction through a single reusable insert command. An empty list is a no-op.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="array"/> is null.</exception>
    public void AddRange(List<string> array)
    {
        if (array == null)
            throw new ArgumentNullException(nameof(array));
        if (array.Count == 0)
            return;

        var nextIndex = GetMaxIndex() + 1;
        using (var transaction = _conn.BeginTransaction())
        using (var insert = _conn.CreateCommand())
        {
            insert.CommandText = $"INSERT INTO {_tableName}(name, `index`) VALUES (@name, @index)";
            var nameParam = new SQLiteParameter("@name");
            var indexParam = new SQLiteParameter("@index");
            insert.Parameters.Add(nameParam);
            insert.Parameters.Add(indexParam);

            foreach (var item in array)
            {
                nameParam.Value = (object)item ?? DBNull.Value;
                indexParam.Value = nextIndex++;
                insert.ExecuteNonQuery();
            }
            transaction.Commit();
        }
    }

    /// <summary>Removes the first occurrence of <paramref name="item"/>; returns false when it is absent.</summary>
    public bool Remove(string item)
    {
        var position = IndexOf(item);
        if (position < 0)
            return false;
        RemoveAt(position);
        return true;
    }

    /// <summary>Number of rows currently stored.</summary>
    public int Count
    {
        get
        {
            return Convert.ToInt32(Prepare("SELECT COUNT(*) FROM " + _tableName).ExecuteScalar());
        }
    }

    /// <summary>Always false: the list can be modified.</summary>
    public bool IsReadOnly { get; }

    /// <summary>Returns the lowest index holding <paramref name="item"/>, or -1 when it is not stored.</summary>
    public int IndexOf(string item)
    {
        var result = Prepare(
            $"SELECT `index` FROM {_tableName} WHERE name = @name ORDER BY `index` LIMIT 1",
            NameParam(item)).ExecuteScalar();
        if (result == null || result == DBNull.Value)
            return -1;
        // The value arrives boxed (SQLite INTEGER is 64-bit), so convert instead of unboxing to int.
        return Convert.ToInt32(result);
    }

    /// <summary>Inserts <paramref name="item"/> at <paramref name="index"/>, moving later elements up by one.</summary>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is negative or greater than <see cref="Count"/>.</exception>
    public void Insert(int index, string item)
    {
        if (index < 0 || index > Count)
            throw new ArgumentOutOfRangeException(nameof(index));

        using (var transaction = _conn.BeginTransaction())
        {
            // Shift in two passes through negative values: a direct `index + 1` update can
            // collide with the not-yet-updated neighbour and violate the UNIQUE index.
            Prepare($"UPDATE {_tableName} SET `index` = -(`index` + 1) WHERE `index` >= @index",
                IndexParam(index)).ExecuteNonQuery();
            Prepare($"UPDATE {_tableName} SET `index` = -`index` WHERE `index` < 0").ExecuteNonQuery();
            Prepare($"INSERT INTO {_tableName}(name, `index`) VALUES (@name, @index)",
                NameParam(item), IndexParam(index)).ExecuteNonQuery();
            transaction.Commit();
        }
    }

    /// <summary>
    /// Deletes the element at <paramref name="index"/> and moves later elements down by one.
    /// Does nothing when no row has that index.
    /// </summary>
    public void RemoveAt(int index)
    {
        using (var transaction = _conn.BeginTransaction())
        {
            var deleted = Prepare($"DELETE FROM {_tableName} WHERE `index` = @index",
                IndexParam(index)).ExecuteNonQuery();
            if (deleted > 0)
            {
                // Same two-pass shift as Insert, in the opposite direction.
                Prepare($"UPDATE {_tableName} SET `index` = -`index` WHERE `index` > @index",
                    IndexParam(index)).ExecuteNonQuery();
                Prepare($"UPDATE {_tableName} SET `index` = -`index` - 1 WHERE `index` < 0").ExecuteNonQuery();
            }
            transaction.Commit();
        }
    }

    /// <summary>
    /// Gets or sets the value at <paramref name="index"/>. The getter returns null when no
    /// row has that index; the setter changes nothing in that case.
    /// </summary>
    public string this[int index]
    {
        get
        {
            return Prepare($"SELECT name FROM {_tableName} WHERE `index` = @index",
                IndexParam(index)).ExecuteScalar() as string;
        }
        set
        {
            Prepare($"UPDATE {_tableName} SET name = @name WHERE `index` = @index",
                NameParam(value), IndexParam(index)).ExecuteNonQuery();
        }
    }

    /// <summary>Releases the command and connection and deletes the database file. Safe to call repeatedly.</summary>
    public void Dispose()
    {
        if (_disposed)
            return;
        _disposed = true;

        if (_cmd != null)
        {
            _cmd.Dispose();
            _cmd = null;
        }
        if (_conn != null)
        {
            _conn.Dispose();
            _conn = null;
        }
        File.Delete($"{_name}.db");
    }
}
}

View File

@ -44,5 +44,6 @@ Todo : https://github.com/shps951023/MiniExcel/projects/1?fullscreen=true
</ItemGroup>
<ItemGroup>
<PackageReference Include="ExcelNumberFormat" Version="1.1.0" />
<PackageReference Include="System.Data.SQLite.Core" Version="1.0.115.5" />
</ItemGroup>
</Project>

View File

@ -17,7 +17,7 @@ namespace MiniExcelLibs.OpenXml
private static readonly string[] _ns = { Config.SpreadsheetmlXmlns, Config.SpreadsheetmlXmlStrictns };
private static readonly string[] _relationshiopNs = { Config.SpreadsheetmlXmlRelationshipns, Config.SpreadsheetmlXmlStrictRelationshipns };
private List<SheetRecord> _sheetRecords;
private List<string> _sharedStrings;
private IList<string> _sharedStrings;
private MergeCells _mergeCells;
private ExcelOpenXmlStyles _style;
private readonly ExcelOpenXmlZip _archive;
@ -490,42 +490,44 @@ namespace MiniExcelLibs.OpenXml
return;
using (var stream = sharedStringsEntry.Open())
{
_sharedStrings = GetSharedStrings(stream).ToList();
}
if (_config.EnableSharedStringCache && sharedStringsEntry.Length >= _config.SharedStringCacheSize)
{
// use sqlite
var dbList = new DbList(Guid.NewGuid().ToString());
var list = new List<string>();
foreach (var sharedString in XmlReaderHelper.GetSharedStrings(stream, _ns))
{
list.Add(sharedString);
if (list.Count >= 10000)
{
dbList.AddRange(list);
list.Clear();
}
}
if (list.Count > 0)
{
dbList.AddRange(list);
list.Clear();
}
_sharedStrings = dbList;
}
else
{
_sharedStrings = XmlReaderHelper.GetSharedStrings(stream, _ns).ToList();
}
}
}
internal List<string> GetSharedStrings()
internal IList<string> GetSharedStrings()
{
if (_sharedStrings == null)
SetSharedStrings();
return _sharedStrings;
}
// Lazily yields one string per "si" element found under the "sst" root of the given
// stream. Yields nothing when the root element or its first content cannot be read.
private IEnumerable<string> GetSharedStrings(Stream stream)
{
    using (var xml = XmlReader.Create(stream))
    {
        var atRoot = XmlReaderHelper.IsStartElement(xml, "sst", _ns);
        if (!atRoot || !XmlReaderHelper.ReadFirstContent(xml))
            yield break;

        while (!xml.EOF)
        {
            if (!XmlReaderHelper.IsStartElement(xml, "si", _ns))
            {
                // Not a string item: move past it, or stop when skipping fails.
                if (XmlReaderHelper.SkipContent(xml))
                    continue;
                yield break;
            }

            yield return StringHelper.ReadStringItem(xml);
        }
    }
}
private void SetWorkbookRels(ReadOnlyCollection<ZipArchiveEntry> entries)
{
if (_sheetRecords != null)

View File

@ -90,7 +90,7 @@ namespace MiniExcelLibs.OpenXml
private Dictionary<string, XMergeCell> XMergeCellInfos { get; set; }
public List<XMergeCell> NewXMergeCellInfos { get; private set; }
private void GenerateSheetXmlImpl(ZipArchiveEntry sheetZipEntry, Stream stream, Stream sheetStream, Dictionary<string, object> inputMaps, List<string> sharedStrings, XmlWriterSettings xmlWriterSettings = null)
private void GenerateSheetXmlImpl(ZipArchiveEntry sheetZipEntry, Stream stream, Stream sheetStream, Dictionary<string, object> inputMaps, IList<string> sharedStrings, XmlWriterSettings xmlWriterSettings = null)
{
var doc = new XmlDocument();
doc.Load(sheetStream);
@ -387,7 +387,7 @@ namespace MiniExcelLibs.OpenXml
.Replace($"xmlns{endPrefix}=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\"", "");
}
private void ReplaceSharedStringsToStr(List<string> sharedStrings, ref XmlNodeList rows)
private void ReplaceSharedStringsToStr(IList<string> sharedStrings, ref XmlNodeList rows)
{
foreach (XmlElement row in rows)
{

View File

@ -11,5 +11,9 @@ namespace MiniExcelLibs.OpenXml
public bool AutoFilter { get; set; } = true;
public bool EnableConvertByteArray { get; set; } = true;
public bool IgnoreTemplateParameterMissing { get; set; } = true;
// Currently a preview feature.
public bool EnableSharedStringCache { get; set; } = false;
public int SharedStringCacheSize { get; set; } = 5 * 1024 * 1024;
}
}

View File

@ -1,4 +1,6 @@
using System.Linq;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace MiniExcelLibs.Utils
{
@ -74,6 +76,31 @@ namespace MiniExcelLibs.Utils
return null;
}
/// <summary>
/// Lazily reads the shared-string items from <paramref name="stream"/>: one value is
/// yielded for every "si" start element found after the "sst" root element.
/// </summary>
/// <param name="stream">Stream positioned at the start of the shared-strings XML.</param>
/// <param name="nss">Namespaces accepted when matching element names.</param>
/// <returns>The string items in document order; empty when the root or its content is missing.</returns>
public static IEnumerable<string> GetSharedStrings(Stream stream, params string[] nss)
{
    using (var xmlReader = XmlReader.Create(stream))
    {
        var rootFound = XmlReaderHelper.IsStartElement(xmlReader, "sst", nss);
        var contentFound = rootFound && XmlReaderHelper.ReadFirstContent(xmlReader);
        if (!contentFound)
            yield break;

        for (; !xmlReader.EOF;)
        {
            var isStringItem = XmlReaderHelper.IsStartElement(xmlReader, "si", nss);
            if (isStringItem)
            {
                yield return StringHelper.ReadStringItem(xmlReader);
                continue;
            }

            // Any other node is skipped; a failed skip ends the enumeration.
            if (!XmlReaderHelper.SkipContent(xmlReader))
                yield break;
        }
    }
}
}
}

View File

@ -9,17 +9,25 @@ using System.Data;
using ExcelDataReader;
using System.Collections.Generic;
using System.Data.SQLite;
using System.Diagnostics;
using Dapper;
using System.Globalization;
using static MiniExcelLibs.Tests.Utils.MiniExcelOpenXml;
using MiniExcelLibs.Tests.Utils;
using MiniExcelLibs.Attributes;
using MiniExcelLibs.OpenXml;
using Xunit.Abstractions;
namespace MiniExcelLibs.Tests
{
public partial class MiniExcelOpenXmlTests
{
private readonly ITestOutputHelper output;
// Stores the xUnit output helper so tests can write diagnostic lines.
public MiniExcelOpenXmlTests(ITestOutputHelper output) => this.output = output;
[Fact]
public void GetColumnsTest()
{
@ -1161,5 +1169,34 @@ namespace MiniExcelLibs.Tests
Assert.Equal(rows[1].B , "value2");
Assert.Equal(rows[1].C , "value3");
}
/// <summary>
/// Reads the first row of the large shared-strings benchmark workbook with the
/// shared-string cache switched on, then reports working-set size and elapsed time.
/// </summary>
[Fact]
public void SharedStringCacheTest()
{
    var path = "../../../../../benchmarks/MiniExcel.Benchmarks/Test1,000,000x10_SharingStrings.xlsx";
    var stopWatch = Stopwatch.StartNew();
    // The cache is opt-in (EnableSharedStringCache defaults to false), so it has to be
    // enabled explicitly; otherwise this test measures the non-cached path.
    MiniExcel.Query(path,
        configuration: new OpenXmlConfiguration() { EnableSharedStringCache = true }).First();
    // Stop before gathering process statistics so they are not included in the timing.
    stopWatch.Stop();
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        long totalBytesOfMemoryUsed = currentProcess.WorkingSet64;
        output.WriteLine("totalBytesOfMemoryUsed: " + totalBytesOfMemoryUsed);
    }
    output.WriteLine("elapsedMilliseconds: " + stopWatch.ElapsedMilliseconds);
}
/// <summary>
/// Baseline for <c>SharedStringCacheTest</c>: reads the first row of the same workbook
/// with the shared-string cache disabled, then reports working-set size and elapsed time.
/// </summary>
[Fact]
public void SharedStringNoCacheTest()
{
    var path = "../../../../../benchmarks/MiniExcel.Benchmarks/Test1,000,000x10_SharingStrings.xlsx";
    var stopWatch = Stopwatch.StartNew();
    // Disable the cache explicitly and keep the size threshold unreachable, so this stays
    // the in-memory baseline even if the default of the switch changes.
    MiniExcel.Query(path,
        configuration: new OpenXmlConfiguration() { EnableSharedStringCache = false, SharedStringCacheSize = int.MaxValue }).First();
    // Stop before gathering process statistics so they are not included in the timing.
    stopWatch.Stop();
    using (Process currentProcess = Process.GetCurrentProcess())
    {
        long totalBytesOfMemoryUsed = currentProcess.WorkingSet64;
        output.WriteLine("totalBytesOfMemoryUsed: " + totalBytesOfMemoryUsed);
    }
    output.WriteLine("elapsedMilliseconds: " + stopWatch.ElapsedMilliseconds);
}
}
}

View File

@ -21,7 +21,7 @@
<PackageReference Include="ExcelDataReader" Version="3.6.0" />
<PackageReference Include="ExcelDataReader.DataSet" Version="3.6.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="System.Data.SQLite.Core" Version="1.0.113.7" />
<PackageReference Include="System.Data.SQLite.Core" Version="1.0.115.5" />
<PackageReference Include="System.Text.Encoding.CodePages" Version="5.0.0" />
<PackageReference Include="xunit" Version="2.4.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1">