Skip to content

Commit 87667dd

Browse files
committed
1 parent d2f252a commit 87667dd

File tree

10 files changed

+494
-0
lines changed

10 files changed

+494
-0
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
using System.Collections.Generic;
2+
using Platform.Numbers;
3+
using Platform.Data.Numbers.Raw;
4+
using Platform.Data.Doublets;
5+
using Platform.Data.Doublets.Sequences.Converters;
6+
using Platform.Data.Doublets.Sequences.Frequencies.Cache;
7+
using Platform.Data.Doublets.Sequences.Indexes;
8+
using Platform.Data.Doublets.Unicode;
9+
10+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
11+
12+
namespace Platform.Data.Doublets.Xml
13+
{
14+
public class DefaultXmlStorage<TLink> : IXmlStorage<TLink>
15+
{
16+
private static readonly TLink _zero = default;
17+
private static readonly TLink _one = Arithmetic.Increment(_zero);
18+
19+
private readonly StringToUnicodeSequenceConverter<TLink> _stringToUnicodeSequenceConverter;
20+
private readonly ILinks<TLink> _links;
21+
private TLink _unicodeSymbolMarker;
22+
private TLink _unicodeSequenceMarker;
23+
private TLink _elementMarker;
24+
private TLink _textElementMarker;
25+
private TLink _documentMarker;
26+
27+
private class Unindex : ISequenceIndex<TLink>
28+
{
29+
public bool Add(IList<TLink> sequence) => true;
30+
public bool MightContain(IList<TLink> sequence) => true;
31+
}
32+
33+
public DefaultXmlStorage(ILinks<TLink> links, bool indexSequenceBeforeCreation, LinkFrequenciesCache<TLink> frequenciesCache)
34+
{
35+
var linkToItsFrequencyNumberConverter = new FrequenciesCacheBasedLinkToItsFrequencyNumberConverter<TLink>(frequenciesCache);
36+
var sequenceToItsLocalElementLevelsConverter = new SequenceToItsLocalElementLevelsConverter<TLink>(links, linkToItsFrequencyNumberConverter);
37+
var optimalVariantConverter = new OptimalVariantConverter<TLink>(links, sequenceToItsLocalElementLevelsConverter);
38+
InitConstants(links);
39+
var charToUnicodeSymbolConverter = new CharToUnicodeSymbolConverter<TLink>(links, new AddressToRawNumberConverter<TLink>(), _unicodeSymbolMarker);
40+
var index = indexSequenceBeforeCreation ? new CachedFrequencyIncrementingSequenceIndex<TLink>(frequenciesCache) : (ISequenceIndex<TLink>)new Unindex();
41+
_stringToUnicodeSequenceConverter = new StringToUnicodeSequenceConverter<TLink>(links, charToUnicodeSymbolConverter, index, optimalVariantConverter, _unicodeSequenceMarker);
42+
_links = links;
43+
}
44+
45+
private void InitConstants(ILinks<TLink> links)
46+
{
47+
var markerIndex = _one;
48+
var meaningRoot = links.GetOrCreate(markerIndex, markerIndex);
49+
_unicodeSymbolMarker = links.GetOrCreate(meaningRoot, Arithmetic.Increment(markerIndex));
50+
_unicodeSequenceMarker = links.GetOrCreate(meaningRoot, Arithmetic.Increment(markerIndex));
51+
_elementMarker = links.GetOrCreate(meaningRoot, Arithmetic.Increment(markerIndex));
52+
_textElementMarker = links.GetOrCreate(meaningRoot, Arithmetic.Increment(markerIndex));
53+
_documentMarker = links.GetOrCreate(meaningRoot, Arithmetic.Increment(markerIndex));
54+
}
55+
56+
public TLink CreateDocument(string name) => Create(_documentMarker, name);
57+
58+
public TLink CreateElement(string name) => Create(_elementMarker, name);
59+
60+
public TLink CreateTextElement(string content) => Create(_textElementMarker, content);
61+
62+
private TLink Create(TLink marker, string content)
63+
{
64+
var contentSequence = _stringToUnicodeSequenceConverter.Convert(content);
65+
return _links.GetOrCreate(marker, contentSequence);
66+
}
67+
68+
public void AttachElementToParent(TLink elementToAttach, TLink parent) => _links.GetOrCreate(parent, elementToAttach);
69+
}
70+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
2+
3+
namespace Platform.Data.Doublets.Xml
4+
{
5+
public interface ICommandLineInterface
6+
{
7+
void Run(params string[] args);
8+
}
9+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
2+
3+
namespace Platform.Data.Doublets.Xml
4+
{
5+
public interface IXmlStorage<TLink>
6+
{
7+
TLink CreateDocument(string name);
8+
TLink CreateElement(string name);
9+
TLink CreateTextElement(string content);
10+
void AttachElementToParent(TLink elementToAttach, TLink parent);
11+
}
12+
}

csharp/Platform.Data.Doublets.Xml/Platform.Data.Doublets.Xml.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
<ItemGroup>
3535
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.0.0" PrivateAssets="All" />
36+
<PackageReference Include="Platform.Data.Doublets" Version="0.6.7" />
3637
</ItemGroup>
3738

3839
</Project>
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
2+
3+
using System.Collections.Generic;
4+
5+
namespace Platform.Data.Doublets.Xml
6+
{
7+
internal class XmlElementContext
8+
{
9+
public readonly Dictionary<string, int> ChildrenNamesCounts;
10+
public int TotalChildren;
11+
12+
public XmlElementContext() => ChildrenNamesCounts = new Dictionary<string, int>();
13+
14+
public void IncrementChildNameCount(string name)
15+
{
16+
if (ChildrenNamesCounts.TryGetValue(name, out int count))
17+
{
18+
ChildrenNamesCounts[name] = count + 1;
19+
}
20+
else
21+
{
22+
ChildrenNamesCounts[name] = 0;
23+
}
24+
TotalChildren++;
25+
}
26+
}
27+
}
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Threading;
4+
using System.Threading.Tasks;
5+
using System.Xml;
6+
using System.Linq;
7+
using Platform.Exceptions;
8+
using Platform.IO;
9+
10+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
11+
12+
namespace Platform.Data.Doublets.Xml
13+
{
14+
public class XmlElementCounter
15+
{
16+
public XmlElementCounter() { }
17+
18+
public Task Count(string file, string elementName, CancellationToken token)
19+
{
20+
return Task.Factory.StartNew(() =>
21+
{
22+
try
23+
{
24+
var context = new RootElementContext();
25+
using (var reader = XmlReader.Create(file))
26+
{
27+
Count(reader, elementName, token, context);
28+
}
29+
Console.WriteLine($"Total elements with specified name: {context.TotalElements}, total content length: {context.TotalContentsLength}.");
30+
}
31+
catch (Exception ex)
32+
{
33+
Console.WriteLine(ex.ToStringWithAllInnerExceptions());
34+
}
35+
}, token);
36+
}
37+
38+
private void Count(XmlReader reader, string elementNameToCount, CancellationToken token, XmlElementContext context)
39+
{
40+
var rootContext = (RootElementContext)context;
41+
var parentContexts = new Stack<XmlElementContext>();
42+
var elements = new Stack<string>(); // Path
43+
// TODO: If path was loaded previously, skip it.
44+
while (reader.Read())
45+
{
46+
if (token.IsCancellationRequested)
47+
{
48+
return;
49+
}
50+
switch (reader.NodeType)
51+
{
52+
case XmlNodeType.Element:
53+
var elementName = reader.Name;
54+
context.IncrementChildNameCount(elementName);
55+
elementName = $"{elementName}[{context.ChildrenNamesCounts[elementName]}]";
56+
if (!reader.IsEmptyElement)
57+
{
58+
elements.Push(elementName);
59+
ConsoleHelpers.Debug("{0} starting...", elements.Count <= 20 ? ToXPath(elements) : elementName); // XPath
60+
parentContexts.Push(context);
61+
context = new XmlElementContext();
62+
}
63+
else
64+
{
65+
ConsoleHelpers.Debug("{0} finished.", elementName);
66+
}
67+
break;
68+
69+
case XmlNodeType.EndElement:
70+
ConsoleHelpers.Debug("{0} finished.", elements.Count <= 20 ? ToXPath(elements) : elements.Peek()); // XPath
71+
var topElement = elements.Pop();
72+
// Restoring scope
73+
context = parentContexts.Pop();
74+
if (topElement.StartsWith(elementNameToCount))
75+
{
76+
rootContext.TotalElements++;
77+
// TODO: Check for 0x00 part/symbol at 198102797 line and 13 position.
78+
//if (rootContext.TotalPages > 3490000)
79+
// selfCancel = true;
80+
if (context.ChildrenNamesCounts[elementNameToCount] % 10000 == 0)
81+
{
82+
Console.WriteLine(topElement);
83+
}
84+
}
85+
break;
86+
87+
case XmlNodeType.Text:
88+
ConsoleHelpers.Debug("Starting text element...");
89+
var content = reader.Value;
90+
rootContext.TotalContentsLength += (ulong)content.Length;
91+
ConsoleHelpers.Debug($"Content length is: {content.Length}");
92+
ConsoleHelpers.Debug("Text element finished.");
93+
break;
94+
}
95+
}
96+
}
97+
98+
private string ToXPath(Stack<string> path) => string.Join("/", path.Reverse());
99+
100+
private class RootElementContext : XmlElementContext
101+
{
102+
public ulong TotalElements;
103+
public ulong TotalContentsLength;
104+
}
105+
}
106+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
using System;
2+
using System.IO;
3+
using Platform.IO;
4+
5+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
6+
7+
namespace Platform.Data.Doublets.Xml
8+
{
9+
public class XmlElementCounterCLI : ICommandLineInterface
10+
{
11+
public void Run(params string[] args)
12+
{
13+
var file = ConsoleHelpers.GetOrReadArgument(0, "Xml file", args);
14+
var elementName = ConsoleHelpers.GetOrReadArgument(1, "Element name to count", args);
15+
if (!File.Exists(file))
16+
{
17+
Console.WriteLine("Entered xml file does not exists.");
18+
}
19+
else if (string.IsNullOrEmpty(elementName))
20+
{
21+
Console.WriteLine("Entered element name is empty.");
22+
}
23+
else
24+
{
25+
using (var cancellation = new ConsoleCancellation())
26+
{
27+
Console.WriteLine("Press CTRL+C to stop.");
28+
new XmlElementCounter().Count(file, elementName, cancellation.Token).Wait();
29+
}
30+
}
31+
}
32+
}
33+
}
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
using System;
2+
using System.Linq;
3+
using System.Collections.Generic;
4+
using System.Threading;
5+
using System.Threading.Tasks;
6+
using System.Xml;
7+
using Platform.Exceptions;
8+
using Platform.Collections;
9+
using Platform.IO;
10+
11+
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
12+
13+
namespace Platform.Data.Doublets.Xml {
14+
public class XmlImporter<TLink>
15+
{
16+
private readonly IXmlStorage<TLink> _storage;
17+
18+
public XmlImporter(IXmlStorage<TLink> storage) => _storage = storage;
19+
20+
public Task Import(string file, CancellationToken token)
21+
{
22+
return Task.Factory.StartNew(() =>
23+
{
24+
try
25+
{
26+
var document = _storage.CreateDocument(file);
27+
28+
using (var reader = XmlReader.Create(file))
29+
{
30+
Read(reader, token, new ElementContext(document));
31+
}
32+
}
33+
catch (Exception ex)
34+
{
35+
Console.WriteLine(ex.ToStringWithAllInnerExceptions());
36+
}
37+
38+
}, token);
39+
}
40+
41+
private void Read(XmlReader reader, CancellationToken token, ElementContext context)
42+
{
43+
var parentContexts = new Stack<ElementContext>();
44+
var elements = new Stack<string>(); // Path
45+
// TODO: If path was loaded previously, skip it.
46+
while (reader.Read())
47+
{
48+
if (token.IsCancellationRequested)
49+
{
50+
return;
51+
}
52+
switch (reader.NodeType)
53+
{
54+
case XmlNodeType.Element:
55+
var elementName = reader.Name;
56+
context.IncrementChildNameCount(elementName);
57+
elementName = $"{elementName}[{context.ChildrenNamesCounts[elementName]}]";
58+
if (!reader.IsEmptyElement)
59+
{
60+
elements.Push(elementName);
61+
ConsoleHelpers.Debug("{0} starting...", elements.Count <= 20 ? ToXPath(elements) : elementName); // XPath
62+
var element = _storage.CreateElement(name: elementName);
63+
parentContexts.Push(context);
64+
_storage.AttachElementToParent(elementToAttach: element, parent: context.Parent);
65+
context = new ElementContext(element);
66+
}
67+
else
68+
{
69+
ConsoleHelpers.Debug("{0} finished.", elementName);
70+
}
71+
break;
72+
case XmlNodeType.EndElement:
73+
ConsoleHelpers.Debug("{0} finished.", elements.Count <= 20 ? ToXPath(elements) : elements.Peek()); // XPath
74+
elements.Pop();
75+
// Restoring scope
76+
context = parentContexts.Pop();
77+
if (elements.Count == 1)
78+
{
79+
if (context.TotalChildren % 10 == 0)
80+
Console.WriteLine(context.TotalChildren);
81+
}
82+
break;
83+
case XmlNodeType.Text:
84+
ConsoleHelpers.Debug("Starting text element...");
85+
var content = reader.Value;
86+
ConsoleHelpers.Debug("Content: {0}{1}", content.Truncate(50), content.Length >= 50 ? "..." : "");
87+
var textElement = _storage.CreateTextElement(content: content);
88+
_storage.AttachElementToParent(textElement, context.Parent);
89+
ConsoleHelpers.Debug("Text element finished.");
90+
break;
91+
}
92+
}
93+
}
94+
95+
private string ToXPath(Stack<string> path) => string.Join("/", path.Reverse());
96+
97+
private class ElementContext : XmlElementContext
98+
{
99+
public readonly TLink Parent;
100+
101+
public ElementContext(TLink parent)
102+
{
103+
Parent = parent;
104+
}
105+
}
106+
}
107+
}

0 commit comments

Comments
 (0)