summaryrefslogtreecommitdiff
path: root/dev-python/nltk-data/nltk-data-20221108.ebuild
blob: 94c3c1e478d78702d1336878c6f28df0430464e8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Copyright 2020-2022 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

# check-reqs is the consumer of the CHECKREQS_DISK_* variables set after the
# SRC_URI assembly below.
inherit check-reqs

DESCRIPTION="Data files for NLTK"
HOMEPAGE="https://www.nltk.org/nltk_data/"

# at least some of the files have poorly documented licenses
# https://github.com/nltk/nltk_data/issues/102
# TODO: create a USE flag for free-ish subset
LICENSE="all-rights-reserved"
SLOT="0"
KEYWORDS="amd64 ~ppc64 ~riscv x86"
# extra: additionally fetch, unpack and install the PACKAGES_*_EXTRA_* lists.
IUSE="extra"
RESTRICT="bindist mirror"

# Every distfile is a .zip archive, so unzip is required at build time.
BDEPEND="app-arch/unzip"

# https://github.com/nltk/nltk_data/commits/gh-pages

# Packages that stay zipped: fetched via add_data with version stamp 20200312
# and installed as-is by install_zips in src_install; src_unpack never
# touches them.
PACKAGES_ZIP_2020=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
	corpora/comtrans
	corpora/conll2007
	corpora/jeita
	corpora/knbc
	corpora/machado
	corpora/masc_tagged
	corpora/nombank.1.0
	corpora/panlex_swadesh
	corpora/propbank
	corpora/reuters
	corpora/semcor
	corpora/universal_treebanks_v20
	sentiment/vader_lexicon
	stemmers/snowball_data
)

# Packages that are extracted: fetched via add_data with version stamp
# 20200312 and unpacked into ${S}/<category>/ by unpack_data in src_unpack.
# Each entry has the form <category>/<package id>.
PACKAGES_UNPACK_2020=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
	corpora/abc
	corpora/alpino
	corpora/brown
	corpora/cess_cat
	corpora/cess_esp
	corpora/chat80
	corpora/city_database
	corpora/cmudict
	corpora/comparative_sentences
	corpora/conll2000
	corpora/conll2002
	corpora/crubadan
	corpora/dependency_treebank
	corpora/dolch
	corpora/europarl_raw
	corpora/floresta
	corpora/framenet_v15
	corpora/framenet_v17
	corpora/gazetteers
	corpora/genesis
	corpora/gutenberg
	corpora/ieer
	corpora/indian
	corpora/lin_thesaurus
	corpora/mac_morpho
	corpora/movie_reviews
	corpora/mte_teip5
	corpora/names
	corpora/nonbreaking_prefixes
	corpora/nps_chat
	corpora/omw
	corpora/opinion_lexicon
	corpora/pl196x
	corpora/ppattach
	corpora/product_reviews_1
	corpora/product_reviews_2
	corpora/pros_cons
	corpora/ptb
	corpora/qc
	corpora/rte
	corpora/senseval
	corpora/sentence_polarity
	corpora/sentiwordnet
	corpora/shakespeare
	corpora/state_union
	corpora/subjectivity
	corpora/swadesh
	corpora/switchboard
	corpora/timit
	corpora/toolbox
	corpora/treebank
	corpora/twitter_samples
	corpora/udhr
	corpora/udhr2
	corpora/verbnet
	corpora/webtext
	corpora/wordnet
	corpora/wordnet_ic
	corpora/words
	grammars/book_grammars
	grammars/large_grammars
	grammars/sample_grammars
	misc/perluniprops
	models/bllip_wsj_no_aux
	models/moses_sample
	models/wmt15_eval
	models/word2vec_sample
	stemmers/porter_test
	stemmers/rslp
	taggers/averaged_perceptron_tagger
	taggers/averaged_perceptron_tagger_ru
)

# Extracted packages whose distfiles carry the version stamp 20211221
# (see the matching add_data / unpack_data calls).
PACKAGES_UNPACK_2021_12=(
	corpora/inaugural
	corpora/omw-1.4
	corpora/wordnet2021
	corpora/wordnet31
	corpora/sinica_treebank
)

# Extracted packages whose distfiles carry the version stamp 20220704.
PACKAGES_UNPACK_2022=(
	corpora/stopwords
	taggers/universal_tagset
)

# Extracted packages whose distfiles carry the version stamp 20221108.
PACKAGES_UNPACK_2022_11=(
	tokenizers/punkt
)

# USE=extra only: extracted packages, distfiles stamped 20200312.
PACKAGES_UNPACK_EXTRA_2020=(
	chunkers/maxent_ne_chunker
	corpora/biocreative_ppi
	corpora/brown_tei
	corpora/kimmo
	corpora/paradigms
	corpora/pe08
	corpora/pil
	corpora/problem_reports
	corpora/smultron
	corpora/unicode_samples
	corpora/verbnet3
	corpora/ycoe
	grammars/basque_grammars
	grammars/spanish_grammars
	help/tagsets
	misc/mwa_ppdb
	taggers/maxent_treebank_pos_tagger
)

# USE=extra only: packages installed as .zip (not extracted), distfiles
# stamped 20220704.
PACKAGES_ZIP_EXTRA_2022=(
	corpora/extended_omw
)

# Append one SRC_URI entry per package.
#
# Arguments:
#   $1   - version stamp embedded in the local distfile name
#   $2.. - package paths of the form <category>/<package id>
# Globals:
#   SRC_URI (appended to)
#
# Each package is fetched from the nltk_data gh-pages branch and renamed to
# nltk-<package id>-<version stamp>.zip via the "->" syntax.
add_data() {
	local snapshot=${1}
	local pkg_path
	shift

	for pkg_path in "$@"; do
		SRC_URI+=$'\n\t\t\t'"https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${pkg_path}.zip"
		SRC_URI+=$'\n\t\t\t\t'"-> nltk-${pkg_path#*/}-${snapshot}.zip"
	done
}

# Unconditional distfiles. The first argument of each call is the version
# stamp that add_data embeds in the renamed distfile.
add_data 20200312 "${PACKAGES_ZIP_2020[@]}" "${PACKAGES_UNPACK_2020[@]}"
add_data 20211221 "${PACKAGES_UNPACK_2021_12[@]}"
add_data 20220704 "${PACKAGES_UNPACK_2022[@]}"
add_data 20221108 "${PACKAGES_UNPACK_2022_11[@]}"
# Distfiles needed only with USE=extra: the add_data output between the two
# SRC_URI appends below ends up inside an "extra? ( ... )" group.
SRC_URI+="
	extra? ("
add_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}"
add_data 20220704 "${PACKAGES_ZIP_EXTRA_2022[@]}"
SRC_URI+="
	)"

# Disk space requirements for check-reqs; the build-directory requirement
# simply mirrors the /usr one.
CHECKREQS_DISK_USR=3G
CHECKREQS_DISK_BUILD=${CHECKREQS_DISK_USR}

# Extract the distfiles of the given packages below ${S}.
#
# Arguments:
#   $1   - version stamp used in the distfile names
#   $2.. - package paths of the form <category>/<package id>
#
# For each package the category directory ${S}/<category> is created and
# entered before calling unpack, so the archive contents land inside it.
unpack_data() {
	local snapshot=${1}
	shift

	local entry dest
	for entry in "$@"; do
		# e.g. "${S}/corpora" for "corpora/abc"
		dest="${S}/${entry%/*}"

		mkdir -p "${dest}" || die
		cd "${dest}" || die
		unpack "nltk-${entry#*/}-${snapshot}.zip"
	done
}

# Unpack phase: extract every PACKAGES_UNPACK_* group with the version stamp
# it was fetched under (these must match the add_data calls that build
# SRC_URI). The PACKAGES_ZIP_* groups are intentionally not unpacked here;
# src_install installs them as archives.
#
# The extra group is only extracted when USE=extra is enabled.
src_unpack() {
	unpack_data 20200312 "${PACKAGES_UNPACK_2020[@]}"
	unpack_data 20211221 "${PACKAGES_UNPACK_2021_12[@]}"
	unpack_data 20220704 "${PACKAGES_UNPACK_2022[@]}"
	unpack_data 20221108 "${PACKAGES_UNPACK_2022_11[@]}"
	if use extra; then
		unpack_data 20200312 "${PACKAGES_UNPACK_EXTRA_2020[@]}"
	fi
}

# Install packages that are shipped as .zip archives.
#
# Arguments:
#   $1   - version stamp used in the distfile names
#   $2.. - package paths of the form <category>/<package id>
#
# Each distfile is taken from ${DISTDIR} and installed into
# /usr/share/nltk_data/<category>/ under the name <package id>.zip, i.e.
# without the "nltk-" prefix and version stamp.
install_zips() {
	local snapshot=${1}
	shift

	local entry name
	for entry in "$@"; do
		name=${entry#*/}

		insinto "/usr/share/nltk_data/${entry%/*}"
		newins "${DISTDIR}/nltk-${name}-${snapshot}.zip" "${name}.zip"
	done
}

# Install phase: move everything in the current directory into
# /usr/share/nltk_data inside the image, then add the packages that are
# shipped as .zip archives (the extra one only with USE=extra).
# NOTE(review): the mv relies on the phase starting in the directory that
# src_unpack populated (${S}) - confirm.
src_install() {
	local data_root=/usr/share/nltk_data

	dodir "${data_root}"
	mv * "${ED}${data_root}/" || die

	install_zips 20200312 "${PACKAGES_ZIP_2020[@]}"
	if use extra; then
		install_zips 20220704 "${PACKAGES_ZIP_EXTRA_2022[@]}"
	fi
}