Commit 6f78640e authored by Loïc Grobol
Browse files

improve stats

parent 38ca324d
Pipeline #8626 passed with stages
in 1 minute and 56 seconds
......@@ -30,13 +30,18 @@ def main(corpus_path: pathlib.Path):
trees: Dict[str, Tuple[str, conllu.TokenList]] = dict()
print("Per file:")
total_tokens = 0
total_trees = 0
for f in corpus_path.glob("*.conllu"):
with open(f) as in_stream:
file_trees = list(conllu.parse_incr(in_stream))
trees.update((t.metadata["sent_id"], (f.stem, t)) for t in file_trees)
n_trees = len(file_trees)
n_tokens = sum(len(t) for t in file_trees)
n_trees = len(file_trees)
total_tokens += n_tokens
total_trees += n_trees
print("Tokens repartition:")
with open(corpus_path / "split.json") as in_stream:
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment