blob: 8af2e67a9a3b09426ccb63eac0a43b702b27f655 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
|
# Copyright 2020-2022 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
# check-reqs is inherited for the CHECKREQS_* disk-space settings further down.
EAPI=8
inherit check-reqs
DESCRIPTION="Data files for NLTK"
HOMEPAGE="https://www.nltk.org/nltk_data/"
# At least some of the files have poorly documented licenses, see
# https://github.com/nltk/nltk_data/issues/102
# TODO: create a USE flag for a free-ish subset
LICENSE="all-rights-reserved"
SLOT="0"
KEYWORDS="amd64 ~ppc64 ~riscv x86"
# NOTE(review): presumably restricted because of the unclear licensing
# described above -- confirm.
RESTRICT="bindist mirror"
# Every distfile fetched by this ebuild is a .zip archive.
BDEPEND="app-arch/unzip"
# Upstream commit log of the gh-pages branch the archives are fetched from:
# https://github.com/nltk/nltk_data/commits/gh-pages
#
# Packages whose upstream index entry has unzip=0 (see the query below).
# src_install installs these as .zip files, without extracting them.
# Entries are "<subdir>/<id>".
PACKAGES_ZIP=(
# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
corpora/bcp47
corpora/comtrans
corpora/conll2007
corpora/extended_omw
corpora/jeita
corpora/knbc
corpora/machado
corpora/masc_tagged
corpora/nombank.1.0
corpora/omw
corpora/omw-1.4
corpora/panlex_swadesh
corpora/propbank
corpora/reuters
corpora/semcor
corpora/universal_treebanks_v20
corpora/wordnet
corpora/wordnet2021
corpora/wordnet31
sentiment/vader_lexicon
stemmers/snowball_data
)
# Packages whose upstream index entry has unzip=1 (see the query below).
# src_unpack extracts these under ${S}/<subdir>/ and src_install moves the
# extracted trees into /usr/share/nltk_data. Entries are "<subdir>/<id>".
PACKAGES_UNPACK=(
# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
chunkers/maxent_ne_chunker
corpora/abc
corpora/alpino
corpora/biocreative_ppi
corpora/brown
corpora/brown_tei
corpora/cess_cat
corpora/cess_esp
corpora/chat80
corpora/city_database
corpora/cmudict
corpora/comparative_sentences
corpora/conll2000
corpora/conll2002
corpora/crubadan
corpora/dependency_treebank
corpora/dolch
corpora/europarl_raw
corpora/floresta
corpora/framenet_v15
corpora/framenet_v17
corpora/gazetteers
corpora/genesis
corpora/gutenberg
corpora/ieer
corpora/inaugural
corpora/indian
corpora/kimmo
corpora/lin_thesaurus
corpora/mac_morpho
corpora/movie_reviews
corpora/mte_teip5
corpora/names
corpora/nonbreaking_prefixes
corpora/nps_chat
corpora/opinion_lexicon
corpora/paradigms
corpora/pe08
corpora/pil
corpora/pl196x
corpora/ppattach
corpora/problem_reports
corpora/product_reviews_1
corpora/product_reviews_2
corpora/pros_cons
corpora/ptb
corpora/qc
corpora/rte
corpora/senseval
corpora/sentence_polarity
corpora/sentiwordnet
corpora/shakespeare
corpora/sinica_treebank
corpora/smultron
corpora/state_union
corpora/stopwords
corpora/subjectivity
corpora/swadesh
corpora/switchboard
corpora/timit
corpora/toolbox
corpora/treebank
corpora/twitter_samples
corpora/udhr
corpora/udhr2
corpora/unicode_samples
corpora/verbnet
corpora/verbnet3
corpora/webtext
corpora/wordnet2022
corpora/wordnet_ic
corpora/words
corpora/ycoe
grammars/basque_grammars
grammars/book_grammars
grammars/large_grammars
grammars/sample_grammars
grammars/spanish_grammars
help/tagsets
misc/mwa_ppdb
misc/perluniprops
models/bllip_wsj_no_aux
models/moses_sample
models/wmt15_eval
models/word2vec_sample
stemmers/porter_test
stemmers/rslp
taggers/averaged_perceptron_tagger
taggers/averaged_perceptron_tagger_ru
taggers/maxent_treebank_pos_tagger
taggers/universal_tagset
tokenizers/punkt
)
# Append one SRC_URI entry per argument.
#
# Each argument is a "<subdir>/<id>" package path. The archive is fetched
# from the upstream gh-pages tree and saved locally as nltk-<id>-${PV}.zip
# (the <subdir> part is dropped from the distfile name).
#
# NOTE(review): the URL follows the gh-pages branch rather than a fixed
# commit, so the content behind a given ${PV} name may change upstream --
# confirm whether pinning is wanted.
add_data() {
	local entry
	for entry in "$@"; do
		SRC_URI+=$'\n'"https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${entry}.zip"
		SRC_URI+=$'\n'"-> nltk-${entry#*/}-${PV}.zip"
	done
}
# Add a SRC_URI entry for every package in both lists.
add_data "${PACKAGES_ZIP[@]}" "${PACKAGES_UNPACK[@]}"
# Disk-space settings for the inherited check-reqs eclass; the build
# figure is deliberately kept equal to the /usr figure.
CHECKREQS_DISK_USR=3G
CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}
# Extract the distfile of each "<subdir>/<id>" argument into ${S}/<subdir>.
#
# The target directory is created on demand and made the current directory
# before unpack is called on nltk-<id>-${PV}.zip. The working directory is
# left at the last package's <subdir> when the function returns.
unpack_data() {
	local entry dest
	for entry in "$@"; do
		dest="${S}/${entry%/*}"
		mkdir -p "${dest}" || die
		cd "${dest}" || die
		unpack "nltk-${entry#*/}-${PV}.zip"
	done
}
# Unpack phase: only the PACKAGES_UNPACK archives are extracted here; the
# PACKAGES_ZIP archives are not touched in this phase.
src_unpack() {
	local -a extract_list=( "${PACKAGES_UNPACK[@]}" )
	unpack_data "${extract_list[@]}"
}
# Install the still-zipped distfile of each "<subdir>/<id>" argument as
# /usr/share/nltk_data/<subdir>/<id>.zip.
#
# The source is ${DISTDIR}/nltk-<id>-${PV}.zip; newins drops the nltk-
# prefix and the version suffix from the installed file name.
install_zips() {
	local entry name
	for entry in "$@"; do
		name=${entry#*/}
		insinto "/usr/share/nltk_data/${entry%/*}"
		newins "${DISTDIR}/nltk-${name}-${PV}.zip" "${name}.zip"
	done
}
# Install phase: move everything in the current directory into the image's
# /usr/share/nltk_data, then add the PACKAGES_ZIP archives next to it as
# plain .zip files.
src_install() {
	local data_root=/usr/share/nltk_data

	dodir "${data_root}"
	# Moved rather than copied. NOTE(review): presumably the current
	# directory is ${S}, holding the trees extracted in src_unpack.
	mv * "${ED}${data_root}/" || die
	install_zips "${PACKAGES_ZIP[@]}"
}
|