commit 1f69645e31b5f38112710e4ffc05c95709659705
parent 772e850cefefecf26b594618e4445dc3511a6f20
Author: Nikolay Korotkiy <sikmir@gmail.com>
Date: Sat, 12 Dec 2020 17:03:04 +0300
Add tatoeba dicts
Diffstat:
3 files changed, 140 insertions(+), 0 deletions(-)
diff --git a/pkgs/data/dicts/tatoeba/default.nix b/pkgs/data/dicts/tatoeba/default.nix
@@ -0,0 +1,61 @@
+# Builds StarDict dictionaries from the Tatoeba per-language exports listed in
+# ./tatoeba.json: one dictionary for every language pair in `langs`.
+{ stdenvNoCC, lib, fetchurl, dict, jq, moreutils, stardict-tools, tatoebatools }:
+let
+  # "<source> <target>" language pairs. Each string is interpolated unquoted
+  # into the build script, so parallel_corpus receives it as two arguments;
+  # the space becomes "_" in the generated file names.
+  langs = [
+    "deu eng"
+    "deu rus"
+    "eng epo"
+    "eng deu"
+    "eng fin"
+    "eng rus"
+    "epo eng"
+    "epo rus"
+    "fin eng"
+    "fin rus"
+    "rus deu"
+    "rus eng"
+    "rus epo"
+    "rus fin"
+  ];
+
+  # Attribute set of fetchurl specs ({ url, sha256 }), one per export file.
+  tatoeba = builtins.fromJSON (builtins.readFile ./tatoeba.json);
+in
+stdenvNoCC.mkDerivation rec {
+  pname = "tatoeba";
+  version = "2020-12-05";
+
+  srcs = lib.mapAttrsToList (_: fetchurl) tatoeba;
+
+  nativeBuildInputs = [ jq moreutils stardict-tools dict tatoebatools ];
+
+  # The phases below replace the stdenv defaults entirely, so each one calls
+  # runHook itself; otherwise pre*/post* hooks added via overrideAttrs would
+  # never run.
+
+  # Decompress every export into the build directory and register it in
+  # versions.json under its file name (minus ".tsv.bz2") with the value
+  # "<version> 00:00:00".
+  # NOTE(review): presumably tatoebatools reads versions.json to decide that the
+  # local files are current and need no download -- confirm against tatoebatools.
+  unpackPhase = let
+    unpackOne = src: ''
+      bzcat ${src} > ${lib.removeSuffix ".bz2" src.name}
+      jq '.+{"${lib.removeSuffix ".tsv.bz2" src.name}":"${version} 00:00:00"}' versions.json | \
+        sponge versions.json
+    '';
+  in ''
+    runHook preUnpack
+    echo "{}" > versions.json
+    ${lib.concatMapStringsSep "\n" unpackOne srcs}
+    runHook postUnpack
+  '';
+
+  # Move the unpacked exports into tatoebatools/{links,sentences_detailed} under
+  # XDG_DATA_HOME (presumably where parallel_corpus looks for its data -- TODO
+  # confirm), then write one tab file per language pair and run
+  # stardict-tabfile on it.
+  buildPhase = let
+    makeDict = lang: let
+      tabFile = "tatoeba_${lib.replaceStrings [ " " ] [ "_" ] lang}.tab";
+    in ''
+      parallel_corpus ${lang} > ${tabFile}
+      stardict-tabfile ${tabFile}
+    '';
+  in ''
+    runHook preBuild
+    export XDG_DATA_HOME="$PWD"
+    mkdir -p tatoebatools/{links,sentences_detailed}
+    mv *_links.tsv tatoebatools/links
+    mv *_sentences_detailed.tsv tatoebatools/sentences_detailed
+    mv versions.json tatoebatools
+    ${lib.concatMapStringsSep "\n" makeDict langs}
+    runHook postBuild
+  '';
+
+  # Install the generated dictionary files directly into the output root.
+  installPhase = ''
+    runHook preInstall
+    install -Dm644 *.{dict.dz,idx,ifo} -t "$out"
+    runHook postInstall
+  '';
+
+  meta = with lib; {
+    description = "Tatoeba is a collection of sentences and translations";
+    homepage = "https://tatoeba.org/";
+    license = licenses.free;
+    maintainers = with maintainers; [ sikmir ];
+    platforms = platforms.all;
+    skip.ci = true;
+  };
+}
diff --git a/pkgs/data/dicts/tatoeba/tatoeba.json b/pkgs/data/dicts/tatoeba/tatoeba.json
@@ -0,0 +1,78 @@
+{
+ "deu_sentences_detailed": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/deu/deu_sentences_detailed.tsv.bz2",
+ "sha256": "119v8pfs1rwm08f3cl162yxb37jqsf7r29kcl9gffmvxdf9bnn2m"
+ },
+ "eng_sentences_detailed": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/eng/eng_sentences_detailed.tsv.bz2",
+ "sha256": "0fxzb5mqnzczk5vsjrw4fnmg2g19xja8h4ngj8h5as11ymc5v360"
+ },
+ "epo_sentences_detailed": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/epo/epo_sentences_detailed.tsv.bz2",
+ "sha256": "1a3w2cwspafz4g0lkwiq6p4bdwz4xikih579iflax2g4zvk1xnhz"
+ },
+ "fin_sentences_detailed": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/fin/fin_sentences_detailed.tsv.bz2",
+ "sha256": "1x7ivh2f5d5nf4cxdlji934rvg60q9sbzg00n4mxsp9axjjsq8vd"
+ },
+ "rus_sentences_detailed": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/rus/rus_sentences_detailed.tsv.bz2",
+ "sha256": "1rs0pj0a02b0cb3wh3anhq306s7f30g39ig3kcpw18jscncbbgb8"
+ },
+ "deu-eng_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/deu/deu-eng_links.tsv.bz2",
+ "sha256": "04wiml623prpna9x7w8r2bl0br0fpar9c0zsqjf8v7jm8hcqnn9g"
+ },
+ "deu-rus_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/deu/deu-rus_links.tsv.bz2",
+ "sha256": "1lzz9mg19kl4p5gy1fjinlhbkbqc9pgdbmw17z47yf62pyf7hzk5"
+ },
+ "eng-epo_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/eng/eng-epo_links.tsv.bz2",
+ "sha256": "1k2mg81z5191m2m8p2kr7ghwiakxpjvr8by2gn1fdyi0sh77pfwi"
+ },
+ "eng-deu_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/eng/eng-deu_links.tsv.bz2",
+ "sha256": "0ag10iv8v829ny6bfdldcw2wzil4q8803h265934ynydmjg5dh54"
+ },
+ "eng-fin_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/eng/eng-fin_links.tsv.bz2",
+ "sha256": "09rci2696xjp5pjb2w4w0m8njczycy2i9kdri2qzvw5hj9rp1cmh"
+ },
+ "eng-rus_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/eng/eng-rus_links.tsv.bz2",
+ "sha256": "0lxbz5pfx6f11l24kixsw1w1xy2w1rwmnprm0bfamhzbir1i8lm6"
+ },
+ "epo-eng_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/epo/epo-eng_links.tsv.bz2",
+ "sha256": "0dhgr07wsxnarvd1rqx7l289fhgs1jwr5znjvbwgqbcp3c87cfbd"
+ },
+ "epo-rus_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/epo/epo-rus_links.tsv.bz2",
+ "sha256": "0zmvxh76vy9ghq6ag8kcng0jf77p07yv37dvhwigv0ngjwi03hmc"
+ },
+ "fin-eng_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/fin/fin-eng_links.tsv.bz2",
+ "sha256": "07mm92z5d1vdqmchx4ii8q0ndlar76p64cn3m5hhxlzl3z7cgd6p"
+ },
+ "fin-rus_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/fin/fin-rus_links.tsv.bz2",
+ "sha256": "1ryja7587cmvyh63jcmpnqis7nlwmk6igbllirfsrflyja2q108p"
+ },
+ "rus-deu_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/rus/rus-deu_links.tsv.bz2",
+ "sha256": "0azrlak6hr0mf6ghkms9ppqy2bcmv82ra6dfyw482n7qg8yjp48k"
+ },
+ "rus-eng_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/rus/rus-eng_links.tsv.bz2",
+ "sha256": "14fccxghialf18pcxlij37l3gb2ywzqc6mlqjdgxd6m1inayldpk"
+ },
+ "rus-epo_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/rus/rus-epo_links.tsv.bz2",
+ "sha256": "0rry16qmnpbqhnyminnqafwd0fy4y1rhb1cpwwy25mjlhfbhp0wd"
+ },
+ "rus-fin_links": {
+ "url": "https://downloads.tatoeba.org/exports/per_language/rus/rus-fin_links.tsv.bz2",
+ "sha256": "0qx2fvysxxdgazqn2g0jf4n3i2rkz42ih54cw4lnw5ah26qwq0ks"
+ }
+}
diff --git a/pkgs/default.nix b/pkgs/default.nix
@@ -64,6 +64,7 @@ lib.makeScope newScope (
freedict = callPackage ./data/dicts/freedict { };
huzheng = callPackage ./data/dicts/huzheng { };
it-sanasto = callPackage ./data/dicts/it-sanasto { };
+ tatoeba = callPackage ./data/dicts/tatoeba { };
wiktionary = callPackage ./data/dicts/wiktionary { };
gpsmap64 = callPackage ./data/firmwares/gpsmap64 { };