From 92170f1fc2a684a4926f7b59af1bfa284e49a964 Mon Sep 17 00:00:00 2001 From: meld-cp <18450687+meld-cp@users.noreply.github.com> Date: Sun, 21 Jul 2024 00:27:13 +1200 Subject: [PATCH] Reduce memory requirements when processing templates + template formulas (#638) * fix for issue 606 * fix formatting --- samples/xlsx/TestIssue606_Template.xlsx | Bin 0 -> 9426 bytes src/MiniExcel/OpenXml/Config.cs | 1 + .../OpenXml/ExcelOpenXmlTemplate.Impl.cs | 91 ++++++++++++------ src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.cs | 8 +- src/MiniExcel/Utils/calChainHelper.cs | 24 ++--- tests/MiniExcelTests/MiniExcelIssueTests.cs | 53 +++++++++- 6 files changed, 130 insertions(+), 47 deletions(-) create mode 100644 samples/xlsx/TestIssue606_Template.xlsx diff --git a/samples/xlsx/TestIssue606_Template.xlsx b/samples/xlsx/TestIssue606_Template.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..bc3c78710e721020da4693fe727f3c6cec846a45 GIT binary patch literal 9426 zcmeHtg);YJxXUEC1&81|xI+j|Ah;*NoghJj+Yp>!!96%3I0X33 z=I(yGo89j(xVPsyr=EVM-Zi&`$RGeJ0384T&;ZO1v#pH~0DyR80DusH zj%Xn5U+Z-7$oh-?L_2}T!ud5(vzYaAWwLn8WG)Ul(3 zVI5jVWCEPM{re2<4@eSR_4SV4g2*JeNsNtSpJxKnTpGI7`D9ilmC8x2iHOC5*0bmO zwegqxny^dti-lnGohuOSC49k8C7*%ovNVezfFizCaZEo}mns(hq1B~hoP`hD?s;C8=fem?EYucA+% zUN=7?vNmQn$dL%937MD$Jhh9iBd*|a9QE%JC(2V9{MJRic*C$tNZzwdX3?Az7jFT1 zNbA71QqZ3x^F&C1H6)-2Hqa>8@#Mgq+|RLa9Bl(x_VONdj=$4<(jf&L=%AtZ4?kFe z?TGm9k8K*fI6O|-3FNs$1pw~vkpNHsM#~RcTy&>!TT_9v4hv38V^>RiC@05n^Z)4h zU(CV3+XU9R?64z+AK|czsl|Apl(MI!LJN&%K#<}B-iO#cda}iKCUT%A=?f(J zz}A2tpXL`uV|QUR=WBfB(F7n7x_Zxwu+(c8cQjT;*AzLI^3@)EkEye%^K^M7Z)T6y zIM%Yp;&;je%XD&|k7cTXqZ~S<7=#7nVZ@KqgZ2AV^_I*oD-ouoG!81l-Zu!m-Ax?L z44g?R+`$o#5K`HnNGF0p&8)su`oZjJ&aOzcG^~Ye-kasP3e$QUTiAD;$Yiz?-ukga zvS8}8d<2&~qw@U>Z_fht>-dfbv%Px+FuKcN-v&aWsfw525$c~L$;fNxjz9tc`rx4q z5B_Al>^VK1T(UN%mIwUpd-m#7@&{}Te@MY&x-n7!&bqn9Bn|8R#b zE^1-fu9b+8vFk&LgZ^~tj~yW@x-=5D2D{7PGGgBZ9$+;OQ~Puh;j9QX4LX`W`ORg0 za}TYSZBFcDWIOgix2qhMu!C6wJ2)$qq$EXEn}6^DW>2{5JsxRKwMsCqspn+#g_8yV zyJ>lEU)>pHX|x*)b|&%zkRE=`I3MecTOvEHiVvk)^g~w}Jl_w9vcW(w0z!uV}=C6oMqL4Y@=7alcM&?-{&&Z$Li?f>y5xwuOsNWR)X6 z3!ZEa3OZ7*30_ADLjfJ2%Vxxeg%KlHRtT9|2qA$Wg^*Jf?XY&AP411IgBj##((|f? z=7ZT(hq~-V)yn2b2ZRfc0&zUcr3T~s%yp3nF*OErDnRku7 zNnWN*S<3&|FU*ZP941c{$dUlv%uR36I$Cv;y7|P?KL4F<@bu{U#z6nnGJK`~X)g5Mq2vWK6wgy@i zo?0z&cKw2pS~0xye36B>7ZhaJ$+TMeT7xdJqfoIWbL|EuHMA-f8;(qZRW3OMyW(=M zTJZQN)r9~GW*v>HtnHWg1lj3w4SS<+wB<{TPII^k0q@2c=E$JBa<2=086$8vgZaNb z2Padb-f6WcP=V-@p#<(pJuZ|OU1XE^DYlr>jtyBgB8mo`q@N{0Ws$)@&T^>giZFyx zm?C^^@B}oYkf8wUeTj zHylabdTmy{U&@GEWi|J+nP{O;@wTD_0|9(B)h#vpHdk)Z_Y{dcyVbq*19zcwLPjkVzwSR z%0c-Qh4T%MZx_X7?-#XpJ+v_U9KGRU(|CHh;OwK=B+l~{B>Ub+44xeXXK#Wd`-{)Z zM^_wzd_6JV21Pk28e}@v+zo?k!zC2chSy0TJR8Ph1M8mCM4_gs^>Yf``x0~q^c-a; z6rVwr01M&}fk|0Q9HLHM;KGvSLsQpX+npDyT&gwRtGPDNdMM6JOY{oit|A3eSO<3p zrLWv0#sxa+F%k6mC7O=&h#t%t*}!G~1H;bqN*3I&f<{AIV`+-$f?soAoQ_e~7vzCWby*8r$ zqSZPNzSf%U@6K}$jv3zgyS<6L|7q4gr}C6}#}EmSY~afww1~G}HbMEHC^pW~LKEh~ zZRsZK1DD@1b7h`P^gNJG0i)Gim}*wVX-(hm;AOGq4d|6j>|;AQ?d|NeA0wCzP9Bu-RY!u@1-1T)ZHg*o0{iW%r2i~|X_=QYC&&Gf`H?hH^%qA@h}zoHf0PIH3) z*1bCRAjXrnJy1Y14jFFjAqEQR2Xy#=OLw?*hs!g#Ji|b7#ehp343wo*>ryuAMWlBU z&D`*T0Us>zK>{B_@340;?S^!2CG8d!VcJ5 zLFaGiJePWS;@R28PI`8*mlyIZ z)eH%MVF-ti>Bf{lO{y-?O$WXZdz10KeFvCW*dxZqi zT(60Qtm+CMi_^@W4$Fb>UPyA~=#)+8npz~H(+n=oCDe_5gay}$W%ik`F%yVaa7)S! zizdpHFv1Cki}Bt11q$!#FOB-P$kBquV2noz#Uo<41|lFOlGFX-M^cFT7t9DOQ3}!$ zRsOgbcD$)Q7C}8~=6)f_a_5X}#7JygbN(8og;5TyLHY(nf)L7aDJn>;wGR(%OkXoZ zPCRpWckQE1!39I&7YeR0AvI44ot>9k%>tqn*)|{Ui|eGjYz0z7r@}}gZ5Ff7ldg2S zA2&=#SLWkB&W_sRbK`rpb);s;PAf9k(O3bB@x1^CKEhaS2EMl4WTMXarmHz7l2Ftp zuZwvRsel1pG0tU6j^7P|4bhX!_7_XO2D6H$JOCSqFMA-Z7t_Gl1(78$-)|68gQ;T4 zL5u7>ssyAgE&C}bMKp*KtFhYHchQ5!xYOW41LliI(MPRd4{!j1b@eyqF^m{!iOFP| zeP7f2(DJ?Z>`u+W!;V=dTpr`L9;@u3-(X#bOX}{d^fBPnV6u6b+&06- zIPAmo8ST_LG9&bIm7(*EifsHB>0Ub<;qEPlWoRM9&L? z0yy^rxqL1z2}D#w4tUaI-i@)pOn>RGWiNz3)eHMHxl?oBDqoZBwQa$uilt#qX||(q zJyHZsO!{ai1f`6yEe5e*R{CuK)nExxs!?}Mql+Z2q_MJ*=^%Pp--)-LUyPj+(1 z<_U5sf`F<~_4+edqjpj3tBp+w{PwP%?1|MrkQQ$ydZ72qI-SEH_bNZ3g zsP*Bg)xg{k>DUXp=CBSxyY2w~B`R;T_Njf6m0(VYPt*_!SJAmUrRGZ^%bSt3RqKXn zH=Nm5JRaW`dm1s2^$;a1BMF4uvfBLHt2$#4zgNGgJ~2vbRr8_>J-)&s9k901wRh}$ z_quzu0l)STt(aL+`4dQ$E?SJ%eyuB^mTGogzm&js82Ao<#WR=9G=Bmp5an3ERc48E zZ&$*k~K5MRfPy4K>0v8ifodM`9(xcE8X{`LRL@s@c?~Dl+3`Y4LJYZg4 z*--bP*TnPIPEH^s5n`Cq!SdNXyF&96JjNAnjIf?B(mA(VH~vHo(>RWxIACI}orVGZ z)r{)`Sa;wm*V+Hc9L-RxG9nhiolxTKr`fyJ$=kCXA_{X7@<)|6qxzi-kpUdb4y~b> zLag0!QQzKW32(J!4rcgwZu9Wj5_6jcjCsI`w{STstn^G=&J3mijy#i97pQ)QV7 zRm08x-hq?PBspN3>6-e(ONH_3+s=iIkdmfA$rXR=l_(@*t0L>DyB5Q?>?+5TvFxkC zbTU|`0Pn>s+`-6|R2*sWASaODen>_wn<1U`%!upxg{RjXQ!8{e$GdX#CzyZ)7PWW zbBTbXZ-`|WuzJXO$*`nX4D_(Pne;eJ_Z@kBG2-BIo4icRnHF!#A6X!;KG6@0c3OMdxwSWu2M_*+TW z8?%`tht6VfW(-Ta<&v?Ux5@S7I;u+uu1-qzHH@0#@bMAX&9Y<(bE~>qvQKMsk$(9y zXGW~}bizx7ZZF_b!J4rN@-j^*I%CN!e$JatX3Bh8 zW11*5FU}B+p_?3PMNoS%H@3#ur~*}&(1Z%d>Lp261ofp#U~r^%S|_jzMe+-N%=+!% zO?n3*J5UOQK%*!a52hR%9d^A!pP~~9;g&}hfBj^y$DSl)Od^K-?TYlJ>ekm4M6iSt zuhOf!6|W#_6>%jM(J^AHYbc5wKL4okG~+I$H{8p&+H79gb4y-uOZE~_5W`E+%=%s3 zC6poe3%$udndelEdi3 z>-jGz4AL?Dhy7I;KPCvnp#ouCw|w=XgEB7#LZsNeTWQzRPkm;J9qOaWOcJlt@eW8E zI<2E4{hr~N){uHV+cZCTr5P-)JRLRI_+7JM$>HmE7Oh<)ZQB0Dz_3|S=kTlhn8@;B zSHo}SPl~rtwS@fKSwG6<&E@iX={l^jX1^qR6Ri5uz$HsGb^^DNU{b9%|APw&cq?UU zQO}c}kW59m%MP3s>*pL$R93fVj%N8~#_sMgMiI-|t2WE{eSJd?2M2rA66Ft+!KJw` z<*5$_TbB^mDU6XhkB^k8XXaw*`i_MbW|&qbe}KFXtaI40ao z!G17@;QNK3i{d;Hfk3;rQ%?VA?MFYS%zYgHvFUPMA+T;3YWc+TTCCd@ z<~WtQalwYu;*gXTWZk%iCE?S2Zj12&WCX|FoXD&d(GK2vo}t_>nePMF+UlVJ-Pxf! zO_D}Ly(CVr^AqT=Oww8gULUX#siZjWWsXZA^rIT^fxLvkS4PU)^WQAHVdTsPs;$vZ zbFseL71Xa2uODwK&blv_i%|4OoTUC(9)8nuT+;MNT05xRh!vP$(&;mB(eH&h-~Rwv zX%j;RQBZcYIV^*h$ zW7CkI;UN@AwU8Yd^q+kEsH;OWp9VL5(!#t<; zil2UPjWKz*_H;7Yu0+JBS+6;e#Yv>TJ+ok$KJL7F>W@KurAGe7d6s$>G|_F#Wzo7`-q(;( z2x;(J_4rVMt{YWb${M4s5Xe#$d(P2_oi`X8CiTrl4Y){f+A7ePT)ndXl8;i*4;=b)GJ_YCp_n%j;1?*NqhF-Ff8ZUvsVs1xetZl$sr9y} z^j!87_rOy-De|@r_foNM~h{4Sn@ZoDsNedZmP`xCsolY)B%VWHLYMQvKli z0CUy*^hU9IC^*~QDHjjYF4o_76{q2!I*7y(kFYl5(RsOqzzxTQS0RARZ;Y9jN*ZWuC zuf4xNp&#K5^hXgZL8)0JvcO1pj|J62I#CwJ`msrhM%G_b2{Qrv6pSuT`8s zwY0(u-tb%bwWjl{f?v0de=3M2`l;a0P2{i8U-QpDp@Ag-f&Q9}{;J{car{p_0N_my q0Q@7W{|f)R-~20Ff$A^t-@WKlWfb`80sxrsPY^t<5T*U??Ee5|FJH6( literal 0 HcmV?d00001 diff --git a/src/MiniExcel/OpenXml/Config.cs b/src/MiniExcel/OpenXml/Config.cs index fed5318..fd0102b 100644 --- a/src/MiniExcel/OpenXml/Config.cs +++ b/src/MiniExcel/OpenXml/Config.cs @@ -6,5 +6,6 @@ public const string SpreadsheetmlXmlStrictns = "http://purl.oclc.org/ooxml/spreadsheetml/main"; public const string SpreadsheetmlXmlRelationshipns = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; public const string SpreadsheetmlXmlStrictRelationshipns = "http://purl.oclc.org/ooxml/officeDocument/relationships"; + public const string SpreadsheetmlXml_x14ac = "http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac"; } } \ No newline at end of file diff --git a/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.Impl.cs b/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.Impl.cs index d3a355d..187382a 100644 --- a/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.Impl.cs +++ b/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.Impl.cs @@ -105,7 +105,9 @@ namespace MiniExcelLibs.OpenXml } } - private List XRowInfos { get; set; } + private List XRowInfos { get; set; } + + private readonly List CalcChainCellRefs = new List(); private Dictionary XMergeCellInfos { get; set; } public List NewXMergeCellInfos { get; private set; } @@ -688,7 +690,10 @@ namespace MiniExcelLibs.OpenXml var mergeBaseRowIndex = newRowIndex; newRowIndex += rowInfo.IEnumerableMercell?.Height ?? 1; - writer.Write(CleanXml(rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above + + // replace formulas + ProcessFormulas( rowXml, newRowIndex ); + writer.Write(CleanXml( rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above //mergecells if (rowInfo.RowMercells != null) @@ -743,30 +748,6 @@ namespace MiniExcelLibs.OpenXml else { - // convert cells starting with '$=' into formulas - var cs = row.SelectNodes($"x:c", _ns); - foreach (XmlElement c in cs) - { - /* Target: - - SUM(C2:C7) - - */ - var vs = c.SelectNodes($"x:v", _ns); - foreach (XmlElement v in vs) - { - if (!v.InnerText.StartsWith("$=")) - { - continue; - } - var fNode = c.OwnerDocument.CreateElement("f", Config.SpreadsheetmlXmlns); - fNode.InnerText = v.InnerText.Substring(2); - c.InsertBefore(fNode, v); - c.RemoveChild(v); - } - } - innerXml = row.InnerXml; - rowXml.Clear() .Append(outerXmlOpen) .AppendFormat(@" r=""{0}"">", newRowIndex) @@ -775,7 +756,10 @@ namespace MiniExcelLibs.OpenXml .Replace($"{{{{$enumrowstart}}}}", enumrowstart.ToString()) .Replace($"{{{{$enumrowend}}}}", enumrowend.ToString()) .AppendFormat("", row.Name); - writer.Write(CleanXml(rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above + + ProcessFormulas( rowXml, newRowIndex ); + + writer.Write(CleanXml( rowXml, endPrefix)); // pass StringBuilder for netcoreapp3.0 or above //mergecells if (rowInfo.RowMercells != null) @@ -810,6 +794,59 @@ namespace MiniExcelLibs.OpenXml writer.Write(contents[1]); } } + + private void ProcessFormulas( StringBuilder rowXml, int rowIndex ) + { + + var rowXmlString = rowXml.ToString(); + + // exit early if possible + if ( !rowXmlString.Contains( "$=" ) ) { + return; + } + + XmlReaderSettings settings = new XmlReaderSettings { NameTable = _ns.NameTable }; + XmlParserContext context = new XmlParserContext( null, _ns, "", XmlSpace.Default ); + XmlReader reader = XmlReader.Create( new StringReader( rowXmlString ), settings, context ); + + XmlDocument d = new XmlDocument(); + d.Load( reader ); + + var row = d.FirstChild as XmlElement; + + // convert cells starting with '$=' into formulas + var cs = row.SelectNodes( $"x:c", _ns ); + for ( var ci = 0; ci < cs.Count; ci++ ) + { + var c = cs.Item( ci ) as XmlElement; + if ( c == null ) { + continue; + } + /* Target: + + SUM(C2:C7) + + */ + var vs = c.SelectNodes( $"x:v", _ns ); + foreach ( XmlElement v in vs ) + { + if ( !v.InnerText.StartsWith( "$=" ) ) + { + continue; + } + var fNode = c.OwnerDocument.CreateElement( "f", Config.SpreadsheetmlXmlns ); + fNode.InnerText = v.InnerText.Substring( 2 ); + c.InsertBefore( fNode, v ); + c.RemoveChild( v ); + + var celRef = ExcelOpenXmlUtils.ConvertXyToCell( ci + 1, rowIndex ); + CalcChainCellRefs.Add( celRef ); + + } + } + rowXml.Clear(); + rowXml.Append( row.OuterXml ); + } private static string ConvertToDateTimeString(KeyValuePair propInfo, object cellValue) { diff --git a/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.cs b/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.cs index c2a38ea..d290695 100644 --- a/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.cs +++ b/src/MiniExcel/OpenXml/ExcelOpenXmlTemplate.cs @@ -25,7 +25,8 @@ namespace MiniExcelLibs.OpenXml _isExpressionRegex = new Regex("(?<={{).*?(?=}})"); _ns = new XmlNamespaceManager(new NameTable()); _ns.AddNamespace("x", Config.SpreadsheetmlXmlns); - } + _ns.AddNamespace( "x14ac", Config.SpreadsheetmlXml_x14ac ); + } private readonly Stream _stream; private readonly OpenXmlConfiguration _configuration; @@ -118,10 +119,7 @@ namespace MiniExcelLibs.OpenXml using (var filledStream = entry.Open()) { sheetIdx++; - var filledDoc = new XmlDocument(); - filledDoc.Load(filledStream); - var filledSheetData = filledDoc.SelectSingleNode("/x:worksheet/x:sheetData", _ns); - _calcChainContent.Append(CalcChainHelper.GetCalcChainContentFromSheet(filledSheetData, _ns, sheetIdx)); + _calcChainContent.Append( CalcChainHelper.GetCalcChainContent( CalcChainCellRefs, sheetIdx ) ); } } diff --git a/src/MiniExcel/Utils/calChainHelper.cs b/src/MiniExcel/Utils/calChainHelper.cs index 6747c7a..1d40c30 100644 --- a/src/MiniExcel/Utils/calChainHelper.cs +++ b/src/MiniExcel/Utils/calChainHelper.cs @@ -1,6 +1,6 @@ -using System.IO; +using System.Collections.Generic; +using System.IO; using System.Text; -using System.Xml; namespace MiniExcelLibs.Utils { @@ -13,22 +13,18 @@ namespace MiniExcelLibs.Utils // Each element should have a r attribute that specifies the cell's address (e.g., "A1" or "B2"). // The element should also have a i attribute that specifies the index of the formula in the formulas collection (in the workbook's sheet data file). // https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.spreadsheet.calculationchain?view=openxml-2.8.1 - public static string GetCalcChainContentFromSheet(in XmlNode sheetData, XmlNamespaceManager ns, int sheetIndex) - { + public static string GetCalcChainContent( List cellRefs, int sheetIndex ) { - StringBuilder calcChainContent = new StringBuilder(); + StringBuilder calcChainContent = new StringBuilder(); - // each c having f nodes - var cs = sheetData.SelectNodes($"x:row/x:c[./x:f]", ns); - foreach (XmlElement c in cs) - { - calcChainContent.Append($@""); - } + foreach ( string cr in cellRefs ) { + calcChainContent.Append( $@"" ); + } - return calcChainContent.ToString(); - } + return calcChainContent.ToString(); + } - public static void GenerateCalcChainSheet(Stream calcChainStream, string calcChainContent) + public static void GenerateCalcChainSheet(Stream calcChainStream, string calcChainContent) { using (var writer = new StreamWriter(calcChainStream, Encoding.UTF8)) { diff --git a/tests/MiniExcelTests/MiniExcelIssueTests.cs b/tests/MiniExcelTests/MiniExcelIssueTests.cs index 8d16a7b..e849293 100644 --- a/tests/MiniExcelTests/MiniExcelIssueTests.cs +++ b/tests/MiniExcelTests/MiniExcelIssueTests.cs @@ -1,4 +1,4 @@ -using Dapper; +using Dapper; using MiniExcelLibs.Attributes; using MiniExcelLibs.Csv; using MiniExcelLibs.Exceptions; @@ -3641,5 +3641,56 @@ MyProperty4,MyProperty1,MyProperty5,MyProperty2,MyProperty6,,MyProperty3 Assert.Equal(2, getRowsInfo.Length ); } + + + [Fact] + public void Issue606_1() + { + // excel max rows: 1,048,576 + // before changes: 1999999 => 25.8 GB mem + // after changes: 1999999 => peaks at 3.2 GB mem (10:20 min) + // after changes: 100000 => peaks at 222 MB mem (34 sec) + + var value = new + { + Title = "My Title", + OrderInfo = Enumerable + .Range( 1, 100 ) + .Select( x => new + { + Standard = "standard", + RegionName = "region", + DealerName = "department", + SalesPointName = "region", + CustomerName = "customer", + IdentityType = "aaaaaa", + IdentitySeries = "ssssss", + IdentityNumber = "nnnnn", + BirthDate = "date", + TariffPlanName = "plan", + PhoneNumber = "num", + SimCardIcc = "sss", + BisContractNumber = "eee", + CreatedAt = "dd.mm.yyyy", + UserDescription = "fhtyrhthrthrt", + UserName = "dfsfsdfds", + PaymentsAmount = "dfhgdfgadfgdfg", + OrderState = "agafgdafgadfgd", + }) + }; + + var path = Path.Combine + ( + Path.GetTempPath(), + string.Concat( nameof( MiniExcelIssueTests ), "_", nameof( Issue606_1 ), ".xlsx" ) + ); + + var templateFileName = @"../../../../../samples/xlsx/TestIssue606_Template.xlsx"; + + + MiniExcel.SaveAsByTemplate( path, Path.GetFullPath( templateFileName ), value ); + + } + } } \ No newline at end of file