Reduce memory requirements when processing templates + template formulas (#638)

* fix for issue 606

* fix formatting
This commit is contained in:
meld-cp 2024-07-21 00:27:13 +12:00 committed by GitHub
parent 9de96abba8
commit 92170f1fc2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 130 additions and 47 deletions

Binary file not shown.

View File

@ -6,5 +6,6 @@
public const string SpreadsheetmlXmlStrictns = "http://purl.oclc.org/ooxml/spreadsheetml/main";
public const string SpreadsheetmlXmlRelationshipns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
public const string SpreadsheetmlXmlStrictRelationshipns = "http://purl.oclc.org/ooxml/officeDocument/relationships";
public const string SpreadsheetmlXml_x14ac = "http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac";
}
}

View File

@ -105,7 +105,9 @@ namespace MiniExcelLibs.OpenXml
}
}
private List<XRowInfo> XRowInfos { get; set; }
private List<XRowInfo> XRowInfos { get; set; }
private readonly List<string> CalcChainCellRefs = new List<string>();
private Dictionary<string, XMergeCell> XMergeCellInfos { get; set; }
public List<XMergeCell> NewXMergeCellInfos { get; private set; }
@ -688,7 +690,10 @@ namespace MiniExcelLibs.OpenXml
var mergeBaseRowIndex = newRowIndex;
newRowIndex += rowInfo.IEnumerableMercell?.Height ?? 1;
writer.Write(CleanXml(rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above
// replace formulas
ProcessFormulas( rowXml, newRowIndex );
writer.Write(CleanXml( rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above
//mergecells
if (rowInfo.RowMercells != null)
@ -743,30 +748,6 @@ namespace MiniExcelLibs.OpenXml
else
{
// convert cells starting with '$=' into formulas
var cs = row.SelectNodes($"x:c", _ns);
foreach (XmlElement c in cs)
{
/* Target:
<c r="C8" s="3">
<f>SUM(C2:C7)</f>
</c>
*/
var vs = c.SelectNodes($"x:v", _ns);
foreach (XmlElement v in vs)
{
if (!v.InnerText.StartsWith("$="))
{
continue;
}
var fNode = c.OwnerDocument.CreateElement("f", Config.SpreadsheetmlXmlns);
fNode.InnerText = v.InnerText.Substring(2);
c.InsertBefore(fNode, v);
c.RemoveChild(v);
}
}
innerXml = row.InnerXml;
rowXml.Clear()
.Append(outerXmlOpen)
.AppendFormat(@" r=""{0}"">", newRowIndex)
@ -775,7 +756,10 @@ namespace MiniExcelLibs.OpenXml
.Replace($"{{{{$enumrowstart}}}}", enumrowstart.ToString())
.Replace($"{{{{$enumrowend}}}}", enumrowend.ToString())
.AppendFormat("</{0}>", row.Name);
writer.Write(CleanXml(rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above
ProcessFormulas( rowXml, newRowIndex );
writer.Write(CleanXml( rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above
//mergecells
if (rowInfo.RowMercells != null)
@ -810,6 +794,59 @@ namespace MiniExcelLibs.OpenXml
writer.Write(contents[1]);
}
}
/// <summary>
/// Converts every cell of a single rendered row whose value text starts with "$=" into a
/// formula cell, and records the address of each converted cell in <c>CalcChainCellRefs</c>.
/// </summary>
/// <param name="rowXml">
/// XML of one row element. When at least one formula marker is found the builder is
/// cleared and refilled with the rewritten row; otherwise it is left untouched.
/// </param>
/// <param name="rowIndex">
/// Row number used to build a cell address only when a cell carries no usable "r" attribute.
/// </param>
private void ProcessFormulas( StringBuilder rowXml, int rowIndex )
{
    var rowXmlString = rowXml.ToString();

    // exit early if possible: rows without the marker are never parsed into a DOM
    if ( !rowXmlString.Contains( "$=" ) )
    {
        return;
    }

    var settings = new XmlReaderSettings { NameTable = _ns.NameTable };
    var context = new XmlParserContext( null, _ns, "", XmlSpace.Default );
    var d = new XmlDocument();

    // Dispose both readers deterministically; this method runs once per generated row.
    using ( var stringReader = new StringReader( rowXmlString ) )
    using ( var reader = XmlReader.Create( stringReader, settings, context ) )
    {
        d.Load( reader );
    }

    // DocumentElement is the root element regardless of any leading non-element nodes.
    var row = d.DocumentElement;
    if ( row == null )
    {
        return;
    }

    // convert cells starting with '$=' into formulas
    var cs = row.SelectNodes( "x:c", _ns );
    for ( var ci = 0; ci < cs.Count; ci++ )
    {
        var c = cs.Item( ci ) as XmlElement;
        if ( c == null )
        {
            continue;
        }

        /* Target:
            <c r="C8" s="3">
                <f>SUM(C2:C7)</f>
            </c>
         */

        // Snapshot the value nodes first: children of the cell are replaced below, and a
        // node list must not be modified while it is being enumerated.
        var valueNodes = new List<XmlElement>();
        foreach ( XmlNode node in c.SelectNodes( "x:v", _ns ) )
        {
            var valueElement = node as XmlElement;
            if ( valueElement != null )
            {
                valueNodes.Add( valueElement );
            }
        }

        var converted = false;
        foreach ( var v in valueNodes )
        {
            var text = v.InnerText;
            if ( !text.StartsWith( "$=", System.StringComparison.Ordinal ) )
            {
                continue;
            }

            var fNode = c.OwnerDocument.CreateElement( "f", Config.SpreadsheetmlXmlns );
            fNode.InnerText = text.Substring( 2 );
            c.InsertBefore( fNode, v );
            c.RemoveChild( v );
            converted = true;
        }

        if ( !converted )
        {
            continue;
        }

        // Prefer the address the cell itself declares: it is what ends up in the sheet XML,
        // and it stays correct when the row is sparse (position in the node list != column).
        // NOTE(review): assumes "r" is already fully resolved at this point; a value that
        // still contains a '{' placeholder is treated as unusable.
        var cellRef = c.GetAttribute( "r" );
        if ( string.IsNullOrEmpty( cellRef ) || cellRef.IndexOf( '{' ) >= 0 )
        {
            cellRef = ExcelOpenXmlUtils.ConvertXyToCell( ci + 1, rowIndex );
        }

        // One calc-chain entry per converted cell.
        CalcChainCellRefs.Add( cellRef );
    }

    rowXml.Clear();
    rowXml.Append( row.OuterXml );
}
private static string ConvertToDateTimeString(KeyValuePair<string, PropInfo> propInfo, object cellValue)
{

View File

@ -25,7 +25,8 @@ namespace MiniExcelLibs.OpenXml
_isExpressionRegex = new Regex("(?<={{).*?(?=}})");
_ns = new XmlNamespaceManager(new NameTable());
_ns.AddNamespace("x", Config.SpreadsheetmlXmlns);
}
_ns.AddNamespace( "x14ac", Config.SpreadsheetmlXml_x14ac );
}
private readonly Stream _stream;
private readonly OpenXmlConfiguration _configuration;
@ -118,10 +119,7 @@ namespace MiniExcelLibs.OpenXml
using (var filledStream = entry.Open())
{
sheetIdx++;
var filledDoc = new XmlDocument();
filledDoc.Load(filledStream);
var filledSheetData = filledDoc.SelectSingleNode("/x:worksheet/x:sheetData", _ns);
_calcChainContent.Append(CalcChainHelper.GetCalcChainContentFromSheet(filledSheetData, _ns, sheetIdx));
_calcChainContent.Append( CalcChainHelper.GetCalcChainContent( CalcChainCellRefs, sheetIdx ) );
}
}

View File

@ -1,6 +1,6 @@
using System.IO;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Xml;
namespace MiniExcelLibs.Utils
{
@ -13,22 +13,18 @@ namespace MiniExcelLibs.Utils
// Each <c> element should have an r attribute that specifies the cell's address (e.g., "A1" or "B2").
// The <c> element should also have an i attribute that identifies the sheet containing the cell (the code below writes the sheet index into it).
// https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.spreadsheet.calculationchain?view=openxml-2.8.1
public static string GetCalcChainContentFromSheet(in XmlNode sheetData, XmlNamespaceManager ns, int sheetIndex)
{
public static string GetCalcChainContent( List<string> cellRefs, int sheetIndex ) {
StringBuilder calcChainContent = new StringBuilder();
StringBuilder calcChainContent = new StringBuilder();
// each c having f nodes
var cs = sheetData.SelectNodes($"x:row/x:c[./x:f]", ns);
foreach (XmlElement c in cs)
{
calcChainContent.Append($@"<c r=""{c.GetAttribute("r")}"" i=""{sheetIndex}""/>");
}
foreach ( string cr in cellRefs ) {
calcChainContent.Append( $@"<c r=""{cr}"" i=""{sheetIndex}""/>" );
}
return calcChainContent.ToString();
}
return calcChainContent.ToString();
}
public static void GenerateCalcChainSheet(Stream calcChainStream, string calcChainContent)
public static void GenerateCalcChainSheet(Stream calcChainStream, string calcChainContent)
{
using (var writer = new StreamWriter(calcChainStream, Encoding.UTF8))
{

View File

@ -1,4 +1,4 @@
using Dapper;
using Dapper;
using MiniExcelLibs.Attributes;
using MiniExcelLibs.Csv;
using MiniExcelLibs.Exceptions;
@ -3641,5 +3641,56 @@ MyProperty4,MyProperty1,MyProperty5,MyProperty2,MyProperty6,,MyProperty3
Assert.Equal(2, getRowsInfo.Length );
}
/// <summary>
/// Regression test for issue 606: renders a template containing a collection and
/// verifies that a non-empty workbook is produced.
/// </summary>
[Fact]
public void Issue606_1()
{
    // excel max rows: 1,048,576
    // Figures recorded by the author (presumably with the Range count below raised to the
    // stated number of rows):
    // before changes:  1999999 => 25.8 GB mem
    //  after changes:  1999999 => peaks at 3.2 GB mem (10:20 min)
    //  after changes:   100000 => peaks at 222 MB mem (34 sec)

    var value = new
    {
        Title = "My Title",
        OrderInfo = Enumerable
            .Range( 1, 100 )
            .Select( x => new
            {
                Standard = "standard",
                RegionName = "region",
                DealerName = "department",
                SalesPointName = "region",
                CustomerName = "customer",
                IdentityType = "aaaaaa",
                IdentitySeries = "ssssss",
                IdentityNumber = "nnnnn",
                BirthDate = "date",
                TariffPlanName = "plan",
                PhoneNumber = "num",
                SimCardIcc = "sss",
                BisContractNumber = "eee",
                CreatedAt = "dd.mm.yyyy",
                UserDescription = "fhtyrhthrthrt",
                UserName = "dfsfsdfds",
                PaymentsAmount = "dfhgdfgadfgdfg",
                OrderState = "agafgdafgadfgd",
            })
    };

    var path = Path.Combine
    (
        Path.GetTempPath(),
        string.Concat( nameof( MiniExcelIssueTests ), "_", nameof( Issue606_1 ), ".xlsx" )
    );

    var templateFileName = @"../../../../../samples/xlsx/TestIssue606_Template.xlsx";

    try
    {
        MiniExcel.SaveAsByTemplate( path, Path.GetFullPath( templateFileName ), value );

        // Pin an observable outcome instead of only "did not throw".
        Assert.True( File.Exists( path ) );
        Assert.True( new FileInfo( path ).Length > 0 );
    }
    finally
    {
        // The output name is fixed, so remove it to avoid leaking state between runs.
        if ( File.Exists( path ) )
        {
            File.Delete( path );
        }
    }
}
}
}