Skip to content

Commit 4e299a6

Browse files
committed
BPE
1 parent aa7166e commit 4e299a6

35 files changed

+168057
-66
lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,4 +49,8 @@ perftest/other/
49 49
perftest/test.py
50 50
perftest/.*/
51 51
docs/blackpaper/
52-
extract.py
52+
extract.py
53+
dataset/BPE
54+
dataset/txt
55+
perftest/performance_test_splade.py
56+
*.h5

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
33 33
-fstrict-aliasing -finline-functions
34 34
-march=native -mtune=native)
35 35
elseif (MSVC)
36-
set(COMPILE_OPTIONS /O2 /W1 /GL /Ot /Ob3 /fp:fast /arch:AVX2 /Zc:__cplusplus /EHsc- /GR-)
36+
set(COMPILE_OPTIONS /O2 /W1 /GL /Ot /Ob3 /fp:fast /arch:AVX2 /Zc:__cplusplus /EHsc- /GR- /link /STACK:4194304)
37 37
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
38 38
endif ()
39 39

dataset/README.md

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,8 @@
1 1
# flash-tokenizer/dataset
2 2

3-
[View documentation for the dataset directory](./DATA.md)
3+
[View documentation for the dataset directory](./DATA.md)
4+
5+
6+
dataset/data/texts_ko/texts_ko.txt
7+
8+
dataset/data/texts_ko/texts_ko.bert-base-cased.txt
Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1,4 @@
1-
{"do_lower_case": false, "model_max_length": 512}
1+
{
2+
"do_lower_case": false,
3+
"model_max_length": 512
4+
}
3.03 MB
Binary file not shown.
Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,20 @@
1+
{
2+
"architectures": [
3+
"BertForMaskedLM"
4+
],
5+
"attention_probs_dropout_prob": 0.1,
6+
"hidden_act": "gelu",
7+
"hidden_dropout_prob": 0.1,
8+
"hidden_size": 768,
9+
"initializer_range": 0.02,
10+
"intermediate_size": 3072,
11+
"layer_norm_eps": 1e-12,
12+
"max_position_embeddings": 512,
13+
"model_type": "bert",
14+
"num_attention_heads": 12,
15+
"num_hidden_layers": 12,
16+
"pad_token_id": 0,
17+
"tokenizer_class": "BertJapaneseTokenizer",
18+
"type_vocab_size": 2,
19+
"vocab_size": 32000
20+
}
Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,6 @@
1+
{
2+
"do_lower_case": false,
3+
"subword_tokenizer_type": "wordpiece",
4+
"word_tokenizer_type": "mecab",
5+
"model_max_length": 512
6+
}

0 commit comments

Comments
 (0)