Automatic extraction of Vossian antonomasia from large newspaper corpora.
(Shout-out to Gerardus Vossius, 1577–1649.)
We plot some temporal distributions:
# Emit one row per corpus year: the year, the article count taken from
# articles.tsv, the number of candidates listed in README.org, and the number
# of candidates whose status is "D" or whose classification is "True".
# NOTE(review): the header names a fifth column (prec) that this loop never
# prints — presumably it is computed afterwards; confirm.
echo "year articles found true prec"
# README.org does not change while the loop runs, so query it once per
# column instead of twice per year.
found=$(../org.py -f year README.org)
judged=$(../org.py -f year,classification,status README.org)
for year in $(seq 1987 2007); do
  # The year is the first field of every row, so anchor the match (as the
  # per-source snippet further down does) to avoid hits in other fields.
  # The article count stays unquoted on purpose: word splitting collapses a
  # missing or multi-line match into single-space separated words.
  echo "$year" \
    $(grep "^$year" articles.tsv | cut -d' ' -f2) \
    "$(printf '%s\n' "$found" | grep -c "^$year")" \
    "$(printf '%s\n' "$judged" | grep "^$year" \
        | awk -F'\t' '$3 == "D" || $2 == "True" { n++ } END { print n + 0 }')"
done
year | articles | found | true | prec | ppm |
---|---|---|---|---|---|
1987 | 106104 | 207 | 103 | 49.8 | 0.97 |
1988 | 104541 | 223 | 99 | 44.4 | 0.95 |
1989 | 102818 | 227 | 109 | 48.0 | 1.06 |
1990 | 98812 | 232 | 111 | 47.8 | 1.12 |
1991 | 85135 | 217 | 107 | 49.3 | 1.26 |
1992 | 82685 | 230 | 115 | 50.0 | 1.39 |
1993 | 79200 | 239 | 124 | 51.9 | 1.57 |
1994 | 74925 | 252 | 129 | 51.2 | 1.72 |
1995 | 85392 | 249 | 134 | 53.8 | 1.57 |
1996 | 79077 | 306 | 155 | 50.7 | 1.96 |
1997 | 85396 | 278 | 143 | 51.4 | 1.67 |
1998 | 89163 | 338 | 191 | 56.5 | 2.14 |
1999 | 91074 | 320 | 150 | 46.9 | 1.65 |
2000 | 94258 | 362 | 188 | 51.9 | 1.99 |
2001 | 96282 | 319 | 165 | 51.7 | 1.71 |
2002 | 97258 | 389 | 191 | 49.1 | 1.96 |
2003 | 94235 | 357 | 186 | 52.1 | 1.97 |
2004 | 91362 | 339 | 163 | 48.1 | 1.78 |
2005 | 90004 | 396 | 179 | 45.2 | 1.99 |
2006 | 87052 | 411 | 187 | 45.5 | 2.15 |
2007 | 39953 | 180 | 85 | 47.2 | 2.13 |
sum | 1854726 | 6071 | 3014 | 49.6 | 34.71 |
mean | 88320 | 289 | 144 | 49.8 | 1.63 |
: The temporal distribution of the number of found and true candidates.
# Plot, per year, the number of candidates (column 3) and Vossantos
# (column 4) on the left axis and the precision (column 5) on the right axis.
# The first plot uses absolute counts, the second divides by column 2 and
# scales to per mille. `data` is not defined here; it is expected to be
# supplied by whatever runs this script.
reset
set datafile separator "\t"
set xlabel "year"
set ylabel "frequency"
set grid linetype 1 linecolor 0
set yrange [0:*]
# The right-hand axis carries the precision in percent.
set y2range [0:100]
set y2label 'precision'
set y2tics
set key top left
set style fill solid 1
set term svg enhanced size 800,600 dynamic fname "Palatino Linotype, Book Antiqua, Palatino, FreeSerif, serif" fsize 16
#set out "nyt_vossantos_over_time.svg"
plot data using 1:3 with linespoints pt 7 lc "red" title 'candidates',\
data using 1:4 with linespoints pt 7 lc "green" title 'Vossantos',\
data using 1:5 with lines lc "blue" title 'precision' axes x1y2
# Disabled plot elements, kept for reference. They must not end in a
# backslash: gnuplot continues a comment onto the next line when the comment
# line ends in one, which previously swallowed the "set term png" below.
# data using 1:2 with linespoints pt 7 axes x1y2 title 'cand'
# data using 1:3 with linespoints pt 7 axes x1y2 title 'wd'
# Re-render the same plot as PNG and PDF.
set term png enhanced size 800,600 font "Arial,16" lw 2
set out "nyt_vossantos_over_time.png"
replot
set key bottom left
set term pdf enhanced fontscale .7 lw 2
set out "nyt_vossantos_over_time.pdf"
replot
# ---- relative values
set key top left
set term svg enhanced size 800,600 dynamic fname "Palatino Linotype, Book Antiqua, Palatino, FreeSerif, serif" fsize 16
set out "nyt_vossantos_over_time_rel.svg"
set ylabel "frequency (per mille)"
set format y "%2.1f"
# Counts divided by the number of articles (column 2), times 1000.
plot data using 1:($3/$2*1000) with linespoints pt 7 lc "red" title 'candidates',\
data using 1:($4/$2*1000) with linespoints pt 7 lc "green" title 'Vossantos',\
data using 1:5 with lines lc "blue" title 'precision' axes x1y2
set term png enhanced size 800,600 font "Arial,16" lw 2
set out "nyt_vossantos_over_time_rel.png"
replot
set term pdf enhanced lw 2
set out "nyt_vossantos_over_time_rel.pdf"
replot
Absolute frequency:
Relative frequency:
The most frequent sources are:
# Rank the values of the sourceUrl field by how often they occur and keep the
# 40 most frequent ones.
../org.py -T -f sourceUrl README.org \
  | sort \
  | uniq -c \
  | sort -n -r \
  | head -n 40
count | source |
---|---|
72 | Michael Jordan |
62 | Rodney Dangerfield |
40 | Johnny Appleseed |
36 | Elvis Presley |
36 | Babe Ruth |
25 | Michelangelo |
25 | Donald Trump |
23 | Pablo Picasso |
23 | Bill Gates |
23 | Madonna |
21 | Jackie Robinson |
20 | P. T. Barnum |
20 | Tiger Woods |
19 | Martha Stewart |
17 | William Shakespeare |
17 | Wolfgang Amadeus Mozart |
17 | Cinderella |
16 | Henry Ford |
16 | John Wayne |
15 | Napoleon |
14 | Leonardo da Vinci |
14 | Greta Garbo |
14 | Rosa Parks |
14 | Adolf Hitler |
14 | Mother Teresa |
14 | Ralph Nader |
13 | Cal Ripken |
12 | Willie Horton |
12 | Leo Tolstoy |
12 | Rembrandt |
12 | Oprah Winfrey |
12 | Susan Lucci |
11 | Walt Disney |
11 | Mike Tyson |
10 | Albert Einstein |
10 | Thomas Edison |
10 | Paul Revere |
10 | Julia Child |
10 | Cassandra |
9 | James Dean |
# Print one tab-separated row per year: the year followed by the number of
# entries for each of the three sources listed in the inner loop.
# The year/source pairs do not change between iterations, so fetch them once.
year_source_pairs=$(../org.py -T -f year,sourceLabel README.org)
for year in $(seq 1987 2007); do
  printf '%s' "$year"
  for s in "Michael Jordan" "Rodney Dangerfield" "Johnny Appleseed"; do
    # Count rows that start with the year and whose second field is exactly
    # the source label; "n + 0" prints 0 when nothing matched.
    c=$(printf '%s\n' "$year_source_pairs" \
      | awk -F'\t' -v y="$year" -v s="$s" \
          'index($0, y) == 1 && $2 == s { n++ } END { print n + 0 }')
    # printf expands \t in every shell; bash's echo prints it literally
    # unless -e is given.
    printf '\t%s' "$c"
  done
  printf '\n'
done
year | Michael Jordan | Rodney Dangerfield | Johnny Appleseed |
---|---|---|---|
1987 | 0 | 0 | 2 |
1988 | 0 | 0 | 1 |
1989 | 1 | 1 | 1 |
1990 | 3 | 2 | 1 |
1991 | 4 | 1 | 1 |
1992 | 2 | 4 | 1 |
1993 | 3 | 4 | 2 |
1994 | 3 | 0 | 0 |
1995 | 0 | 1 | 3 |
1996 | 4 | 8 | 3 |
1997 | 1 | 3 | 1 |
1998 | 6 | 7 | 2 |
1999 | 11 | 2 | 3 |
2000 | 11 | 6 | 1 |
2001 | 7 | 5 | 1 |
2002 | 5 | 2 | 3 |
2003 | 2 | 1 | 3 |
2004 | 0 | 1 | 3 |
2005 | 2 | 8 | 4 |
2006 | 4 | 5 | 3 |
2007 | 3 | 1 | 1 |
# Yearly frequency of the three sources found in columns 2-4 of `data`.
# `data` is not defined here; it is expected to be supplied from outside.
reset
set datafile separator "\t"
set xlabel "year"
set ylabel "frequency"
set grid linetype 1 linecolor 0
set yrange [0:*]
set key top left
set style fill solid 1
set terminal svg enhanced size 800,600 dynamic font "Palatino Linotype, 16"
#set output "nyt_sources_over_time.svg"
plot data using 1:2 with linespoints pointtype 7 linewidth 2 title 'Michael Jordan', \
     data using 1:3 with linespoints pointtype 7 title 'Rodney Dangerfield', \
     data using 1:4 with linespoints pointtype 7 title 'Johnny Appleseed'
# Render the same plot once more as a PNG file.
set terminal png enhanced size 800,600 font "Arial,16" linewidth 2
set output "nyt_sources_over_time.png"
replot
Extract the categories for the articles:
# Run nyt.py --category on every yearly corpus archive, shorten the path in
# front of the tab (drop the "nyt_corpus_" prefix, turn ".har/" into "/",
# drop ".xml"), sort the rows and append them to nyt_categories.tsv.
# The file is only ever appended to, so remove it before re-running the loop.
export PYTHONIOENCODING=utf-8
year=1987
while [ "$year" -le 2007 ]; do
  ../nyt.py --category "../nyt_corpus_${year}.tar.gz" \
    | sed 's/^nyt_corpus_//; s/\.har\//\//; s/\.xml\t/\t/' \
    | sort >> nyt_categories.tsv
  year=$((year + 1))
done
Compute frequency distribution over all articles:
# Count how often each value of the second column of nyt_categories.tsv
# occurs and write "value TAB count" lines to nyt_categories_distrib.tsv.
cut -f 2 nyt_categories.tsv \
  | sort -S1G \
  | uniq -c \
  | awk '{ count = $1; sub(/^ *[0-9]+ /, ""); print $0 "\t" count }' \
  > nyt_categories_distrib.tsv
Check the number of articles and categories, and list the top categories:
# Report how many article rows and distinct categories were extracted,
# followed by the ten most frequent categories.
echo articles $(wc -l < nyt_categories.tsv)
echo categories $(wc -l < nyt_categories_distrib.tsv)
echo ""
# Category names may contain spaces (e.g. "New York and Region"), so the
# field separator must be a tab; with the default blank separation the count
# is no longer field 2 and such categories sort as 0.
sort -t$'\t' -nrk2 nyt_categories_distrib.tsv | head
articles | 1854726 |
---|---|
categories | 1580 |
Business | 291982 |
Sports | 160888 |
Opinion | 134428 |
U.S. | 89389 |
Arts | 88460 |
World | 79786 |
Style | 65071 |
Obituaries | 19430 |
Magazine | 11464 |
Travel | 10440 |
Collect the categories of the articles
# Header row: the number of lines printed by `org.py -T` (labelled
# "vossantos") and the number of lines in nyt_categories.tsv.
echo "vossantos" $(../org.py -T README.org | wc -l) articles $(wc -l < nyt_categories.tsv)
# Pipeline, stage by stage:
#   1. join the ids printed by org.py with nyt_categories.tsv on the first
#      field (join expects both inputs to be sorted on that field),
#   2. turn the first space of each joined line into a tab and keep what
#      follows it (the category),
#   3. count the occurrences per category and rewrite "count category" as
#      "category TAB count",
#   4. join with the overall distribution and print count, category and
#      overall count,
#   5. keep the 20 rows with the highest count.
# NOTE(review): this snippet requests the field `fId`, while the desk and
# author snippets request `fid` — confirm which spelling org.py expects.
../org.py -T -f fId README.org | join nyt_categories.tsv - | sed "s/ /\t/" | awk -F'\t' '{print $2}' \
| sort | uniq -c \
| sed -e "s/^ *//" -e "s/ /\t/" | awk -F'\t' '{print $2"\t"$1}' \
| join -t$'\t' -o1.2,1.1,2.2 - nyt_categories_distrib.tsv \
| sort -nr | head -n20
vossantos | 3014 | category | articles | 1854726 |
---|---|---|---|---|
364 | 12.1% | Arts | 88460 | 4.8% |
362 | 12.0% | Sports | 160888 | 8.7% |
327 | 10.8% | New York and Region | 221897 | 12.0% |
287 | 9.5% | Arts; Books | 35475 | 1.9% |
186 | 6.2% | Movies; Arts | 27759 | 1.5% |
125 | 4.1% | Business | 291982 | 15.7% |
122 | 4.0% | Opinion | 134428 | 7.2% |
110 | 3.6% | U.S. | 89389 | 4.8% |
104 | 3.5% | Magazine | 11464 | 0.6% |
76 | 2.5% | Arts; Theater | 13283 | 0.7% |
70 | 2.3% | Style | 65071 | 3.5% |
52 | 1.7% | World | 79786 | 4.3% |
49 | 1.6% | Home and Garden; Style | 13978 | 0.8% |
37 | 1.2% | 42157 | 2.3% | |
36 | 1.2% | Travel | 10440 | 0.6% |
35 | 1.2% | Technology; Business | 23283 | 1.3% |
30 | 1.0% | Week in Review | 17107 | 0.9% |
29 | 1.0% | Home and Garden | 5546 | 0.3% |
18 | 0.6% | Style; Magazine | 1519 | 0.1% |
18 | 0.6% | Front Page; U.S. | 11425 | 0.6% |
Extract the desks for the articles:
# Run nyt.py --desk on every yearly corpus archive, shorten the path in front
# of the tab (drop the "nyt_corpus_" prefix, turn ".har/" into "/", drop
# ".xml"), sort the rows and append them to nyt_desks.tsv.
# The file is only ever appended to, so remove it before re-running the loop.
export PYTHONIOENCODING=utf-8
year=1987
while [ "$year" -le 2007 ]; do
  ../nyt.py --desk "../nyt_corpus_${year}.tar.gz" \
    | sed 's/^nyt_corpus_//; s/\.har\//\//; s/\.xml\t/\t/' \
    | sort >> nyt_desks.tsv
  year=$((year + 1))
done
Compute frequency distribution over all articles:
# Count how often each value of the second column of nyt_desks.tsv occurs and
# write "value TAB count" lines to nyt_desks_distrib.tsv.
cut -f 2 nyt_desks.tsv \
  | sort -S1G \
  | uniq -c \
  | awk '{ count = $1; sub(/^ *[0-9]+ /, ""); print $0 "\t" count }' \
  > nyt_desks_distrib.tsv
Check the number of articles and desks, and list the top desks:
# Report the number of rows in the desk table and in its distribution,
# followed by the ten most frequent desks.
article_total=$(wc -l < nyt_desks.tsv)
desk_total=$(wc -l < nyt_desks_distrib.tsv)
echo articles $article_total
echo categories $desk_total
echo ""
# Tab-separated input; order by the count column, highest first.
sort -t $'\t' -n -r -k 2 nyt_desks_distrib.tsv | head -n 10
articles | 1854727 |
---|---|
categories | 398 |
Metropolitan Desk | 237896 |
Financial Desk | 206958 |
Sports Desk | 174823 |
National Desk | 143489 |
Editorial Desk | 131762 |
Foreign Desk | 129732 |
Classified | 129660 |
Business/Financial Desk | 112951 |
Society Desk | 44032 |
Cultural Desk | 40342 |
Collect the desks of the articles
# Header row: the number of lines printed by `org.py -T` (labelled
# "vossantos") and the number of lines in nyt_desks.tsv.
echo "vossantos" $(../org.py -T README.org | wc -l) articles $(wc -l < nyt_desks.tsv)
# Pipeline, stage by stage:
#   1. join the ids printed by org.py with nyt_desks.tsv on the first field
#      (join expects both inputs to be sorted on that field),
#   2. turn the first space of each joined line into a tab and keep what
#      follows it (the desk),
#   3. count the occurrences per desk and rewrite "count desk" as
#      "desk TAB count",
#   4. join with the overall distribution and print count, desk and overall
#      count,
#   5. keep the 20 rows with the highest count.
../org.py -T -f fid README.org | join nyt_desks.tsv - | sed "s/ /\t/" | awk -F'\t' '{print $2}' \
| sort | uniq -c \
| sed -e "s/^ *//" -e "s/ /\t/" | awk -F'\t' '{print $2"\t"$1}' \
| join -t$'\t' -o1.2,1.1,2.2 - nyt_desks_distrib.tsv \
| sort -nr | head -n20
vossantos | 3014 | desk | articles | 1854726 |
---|---|---|---|---|
381 | 12.6% | Sports Desk | 174823 | 9.4% |
222 | 7.4% | Metropolitan Desk | 237896 | 12.8% |
220 | 7.3% | Book Review Desk | 32737 | 1.8% |
180 | 6.0% | National Desk | 143489 | 7.7% |
171 | 5.7% | The Arts/Cultural Desk | 38136 | 2.1% |
169 | 5.6% | Arts and Leisure Desk | 27765 | 1.5% |
135 | 4.5% | Magazine Desk | 25433 | 1.4% |
125 | 4.1% | Editorial Desk | 131762 | 7.1% |
117 | 3.9% | Cultural Desk | 40342 | 2.2% |
99 | 3.3% | Movies, Performing Arts/Weekend Desk | 13929 | 0.8% |
96 | 3.2% | Business/Financial Desk | 112951 | 6.1% |
90 | 3.0% | Foreign Desk | 129732 | 7.0% |
78 | 2.6% | Weekend Desk | 18814 | 1.0% |
74 | 2.5% | Leisure/Weekend Desk | 10766 | 0.6% |
72 | 2.4% | Long Island Weekly Desk | 20453 | 1.1% |
69 | 2.3% | Style Desk | 21569 | 1.2% |
57 | 1.9% | Financial Desk | 206958 | 11.2% |
44 | 1.5% | Arts & Leisure Desk | 6742 | 0.4% |
42 | 1.4% | The City Weekly Desk | 22863 | 1.2% |
41 | 1.4% | Connecticut Weekly Desk | 17034 | 0.9% |
Note: there are many errors in the specification of the desks … so this table should be digested with care.
Extract the authors for the articles:
# Run nyt.py --author on every yearly corpus archive, shorten the path in
# front of the tab (drop the "nyt_corpus_" prefix, turn ".har/" into "/",
# drop ".xml"), sort the rows and append them to nyt_authors.tsv.
# The file is only ever appended to, so remove it before re-running the loop.
export PYTHONIOENCODING=utf-8
year=1987
while [ "$year" -le 2007 ]; do
  ../nyt.py --author "../nyt_corpus_${year}.tar.gz" \
    | sed 's/^nyt_corpus_//; s/\.har\//\//; s/\.xml\t/\t/' \
    | sort >> nyt_authors.tsv
  year=$((year + 1))
done
Compute frequency distribution over all articles:
# Count how often each value of the second column of nyt_authors.tsv occurs
# and write "value TAB count" lines to nyt_authors_distrib.tsv. The sort runs
# in the C locale, i.e. in plain byte order.
cut -f 2 nyt_authors.tsv \
  | LC_ALL=C sort -S1G \
  | uniq -c \
  | awk '{ count = $1; sub(/^ *[0-9]+ /, ""); print $0 "\t" count }' \
  > nyt_authors_distrib.tsv
Check the number of articles and authors, and list the most frequent authors:
# Report the number of rows in the author table and in its distribution,
# followed by the ten most frequent authors.
article_total=$(wc -l < nyt_authors.tsv)
author_total=$(wc -l < nyt_authors_distrib.tsv)
echo articles $article_total
echo categories $author_total
echo ""
# Tab-separated input; order by the count column, highest first.
sort -t $'\t' -n -r -k 2 nyt_authors_distrib.tsv | head -n 10
articles | 1854726 |
---|---|
categories | 30691 |
961052 | |
Elliott, Stuart | 6296 |
Holden, Stephen | 5098 |
Chass, Murray | 4544 |
Pareles, Jon | 4090 |
Brozan, Nadine | 3741 |
Fabricant, Florence | 3659 |
Kozinn, Allan | 3654 |
Curry, Jack | 3654 |
Truscott, Alan | 3646 |
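Note: the author data requires cleansing – more than half of the articles (961052 of 1854726) have an empty author field.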
Collect the authors of the articles
# Header row: the number of lines printed by `org.py -T` (labelled
# "vossantos") and the number of lines in nyt_authors.tsv.
echo "vossantos" $(../org.py -T README.org | wc -l) articles $(wc -l < nyt_authors.tsv)
# Pipeline, stage by stage:
#   1. join the ids printed by org.py with nyt_authors.tsv on the first field
#      (join expects both inputs to be sorted on that field),
#   2. turn the first space of each joined line into a tab and keep what
#      follows it (the author),
#   3. count the occurrences per author and rewrite "count author" as
#      "author TAB count"; sorting happens in the C locale,
#   4. join (again in the C locale) with the overall distribution and print
#      count, author and overall count,
#   5. keep the 20 rows with the highest count.
../org.py -T -f fid README.org | join nyt_authors.tsv - | sed "s/ /\t/" | awk -F'\t' '{print $2}' \
| LC_ALL=C sort | uniq -c \
| sed -e "s/^ *//" -e "s/ /\t/" | awk -F'\t' '{print $2"\t"$1}' \
| LC_ALL=C join -t$'\t' -o1.2,1.1,2.2 - nyt_authors_distrib.tsv \
| sort -nr | head -n20
vossantos | 3014 | author | articles | 1854726 |
---|---|---|---|---|
470 | 15.6% | 961052 | 51.8% | |
34 | 1.1% | Maslin, Janet | 2874 | 0.2% |
32 | 1.1% | Holden, Stephen | 5098 | 0.3% |
30 | 1.0% | Vecsey, George | 2739 | 0.1% |
24 | 0.8% | Sandomir, Richard | 3140 | 0.2% |
24 | 0.8% | Dowd, Maureen | 1647 | 0.1% |
23 | 0.8% | Ketcham, Diane | 717 | 0.0% |
20 | 0.7% | Kisselgoff, Anna | 2661 | 0.1% |
20 | 0.7% | Brown, Patricia Leigh | 568 | 0.0% |
19 | 0.6% | Kimmelman, Michael | 1515 | 0.1% |
19 | 0.6% | Berkow, Ira | 1704 | 0.1% |
18 | 0.6% | Barron, James | 2188 | 0.1% |
17 | 0.6% | Stanley, Alessandra | 1437 | 0.1% |
17 | 0.6% | Pareles, Jon | 4090 | 0.2% |
17 | 0.6% | Lipsyte, Robert | 817 | 0.0% |
17 | 0.6% | Araton, Harvey | 1940 | 0.1% |
16 | 0.5% | Smith, Roberta | 2497 | 0.1% |
16 | 0.5% | Martin, Douglas | 1814 | 0.1% |
16 | 0.5% | Chass, Murray | 4544 | 0.2% |
15 | 0.5% | Grimes, William | 1368 | 0.1% |
# Print the README.org lines that mention an article whose joined author row
# contains "Maslin, Janet": join ids with authors, keep the matching rows,
# take the id (first space-separated field) and grep README.org for it.
../org.py -T -f fid README.org \
  | join nyt_authors.tsv - \
  | grep "Maslin, Janet" \
  | cut -d' ' -f1 \
  | while read -r article; do
      grep "$article" README.org
    done
# Rank the non-empty modifiers by frequency and keep the 30 most frequent.
../org.py -T -f modifier,aId README.org \
  | awk -F'\t' 'length($1) > 0 { print $1 }' \
  | sort \
  | uniq -c \
  | sort -n -r \
  | head -n 30
count | modifier |
---|---|
56 | his day |
34 | his time |
29 | Japan |
17 | China |
16 | tennis |
16 | his generation |
16 | baseball |
14 | her time |
13 | our time |
13 | her day |
12 | the Zulus |
11 | the 90's |
11 | the 1990's |
11 | politics |
11 | hockey |
10 | the art world |
10 | Brazil |
10 | basketball |
10 | ballet |
9 | jazz |
9 | fashion |
8 | today |
8 | Iran |
8 | his era |
8 | hip-hop |
8 | golf |
8 | football |
8 | dance |
7 | the 19th century |
7 | Mexico |
Who are the sources for the modifier “today”?
# Count the sources of all entries whose modifier is exactly "today".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="today" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
1 | Shoeless Joe Jackson |
1 | Buck Rogers |
1 | Bill McGowan |
1 | William F. Buckley Jr. |
1 | Ralph Fiennes |
1 | Julie London |
1 | Jimmy Osmond |
1 | Harry Cohn |
Who are the sources for the modifiers “his day”, “his time”, and “his generation”?
# Count the sources of all entries whose modifier is exactly "his day",
# "his time" or "his generation" and show the ten most frequent.
# The pattern is anchored at both ends: an unanchored "his ..." would also
# match modifiers such as "this time" or "this generation".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' '$1 ~ /^his (day|time|generation)$/ { print $2 }' \
  | sort | uniq -c | sort -nr | head
count | source |
---|---|
3 | Donald Trump |
2 | Mike Tyson |
2 | Pablo Picasso |
2 | Billy Martin |
2 | Dan Quayle |
2 | Arnold Schwarzenegger |
2 | Martha Stewart |
2 | L. Ron Hubbard |
2 | Tiger Woods |
Who are the sources for the modifiers “her day”, “her time”, and “her generation”?
# Count the sources of all entries whose modifier is exactly "her day",
# "her time" or "her generation" and show the ten most frequent.
# The pattern is anchored at both ends: an unanchored "her ..." would also
# match modifiers such as "another time" or "other days".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' '$1 ~ /^her (day|time|generation)$/ { print $2 }' \
  | sort | uniq -c | sort -nr | head
count | source |
---|---|
4 | Madonna |
2 | Laurie Anderson |
1 | Hilary Swank |
1 | Pamela Anderson |
1 | Hillary Clinton |
1 | Lotte Lehmann |
1 | Oprah Winfrey |
1 | Marilyn Monroe |
1 | Coco Chanel |
1 | Judith Krantz |
# Count the modifiers that contain one of the listed country names and show
# the ten most frequent.
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' \
      -v countries='(Japan|China|Brazil|Iran|Israel|Mexico|India|South Africa|Spain|South Korea|Russia|Poland|Pakistan)' \
      '$1 ~ countries { print $1 }' \
  | sort \
  | uniq -c \
  | sort -n -r \
  | head -n 10
count | country |
---|---|
29 | Japan |
17 | China |
10 | Brazil |
8 | Iran |
7 | Mexico |
7 | Israel |
7 | India |
4 | South Africa |
4 | Poland |
3 | Spain |
Who are the sources for the modifiers “Japan”, “China”, and “Brazil”?
# Count the sources of all entries whose modifier is exactly "Japan".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="Japan" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
5 | Walt Disney |
4 | Bill Gates |
2 | Nolan Ryan |
2 | Frank Sinatra |
1 | Richard Perle |
1 | Thomas Edison |
1 | Cal Ripken |
1 | Walter Johnson |
1 | Andy Warhol |
1 | Pablo Picasso |
1 | William Wyler |
1 | Stephen King |
1 | Brad Pitt |
1 | Richard Avedon |
1 | P. D. James |
1 | Rem Koolhaas |
1 | Steve Jobs |
1 | Ralph Nader |
1 | Madonna |
1 | Jack Kerouac |
# Count the sources of all entries whose modifier is exactly "China".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="China" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
4 | Barbara Walters |
2 | Jack Welch |
2 | Larry King |
1 | Louis XIV of France |
1 | Oskar Schindler |
1 | Napoleon |
1 | Keith Haring |
1 | Mikhail Gorbachev |
1 | Donald Trump |
1 | Ted Turner |
1 | Madonna |
1 | The Scarlet Pimpernel |
# Count the sources of all entries whose modifier is exactly "Brazil".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="Brazil" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
1 | Giuseppe Verdi |
1 | Jil Sander |
1 | Walter Reed |
1 | Lech Wałęsa |
1 | Jim Morrison |
1 | Bob Dylan |
1 | Elvis Presley |
1 | Scott Joplin |
1 | Larry Bird |
1 | Pablo Escobar |
# Count the modifiers that contain one of the listed sports, most frequent
# first.
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' \
      -v sports='(baseball|hockey|basketball|tennis|golf|football|racing|soccer|sailing)' \
      '$1 ~ sports { print $1 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | sports |
---|---|
16 | tennis |
16 | baseball |
11 | hockey |
10 | basketball |
8 | golf |
8 | football |
6 | soccer |
6 | racing |
3 | women’s basketball |
3 | sailing |
3 | auto racing |
2 | pro football |
2 | New York baseball |
1 | Yale football fame |
1 | women’s hockey |
1 | women’s college soccer |
1 | this year’s national collegiate basketball tournament |
1 | the tennis tour |
1 | the tennis field |
1 | the soccer set |
1 | the racing world |
1 | the Olympic hockey tournament |
1 | stock-car racing |
1 | Rotisserie baseball |
1 | pro football owners |
1 | professional basketball coaches |
1 | professional basketball |
1 | motocross racing in the 1980’s |
1 | micro golfers |
1 | major league baseball |
1 | Laser sailing |
1 | Japanese baseball |
1 | Iraqi soccer |
1 | horse racing |
1 | hockey in the former Soviet Union |
1 | hockey commentary |
1 | high school baseball in New York |
1 | harness racing |
1 | golf criticism |
1 | football teams |
1 | football owners |
1 | football announcers |
1 | European hockey |
1 | country-club golf |
1 | college football underclassmen |
1 | college football these days |
1 | college football |
1 | college basketball |
1 | Chinese baseball |
1 | Brazilian basketball for the past 20 years |
1 | BMX racing |
1 | biddy basketball |
1 | basketball announcers |
1 | basketball analysts |
1 | basketball analysis |
1 | baseball’s new era |
1 | baseball managers |
1 | baseball executives |
1 | baseball collections |
1 | baseball cards |
Who are the sources for the modifiers “baseball”, “tennis”, “basketball”, “football”, “racing”, and “golf”?
# Count the sources of all entries whose modifier is exactly "baseball".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="baseball" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
2 | P. T. Barnum |
2 | Larry Bird |
1 | Clifford Irving |
1 | Mike Tyson |
1 | Thomas Dooley |
1 | Marco Polo |
1 | Pablo Picasso |
1 | Horatio Alger |
1 | Rodney Dangerfield |
1 | Michael Jordan |
1 | Alan Alda |
1 | Brandon Tartikoff |
1 | Howard Hughes |
1 | Thomas Jefferson |
# Count the sources of all entries whose modifier is exactly "tennis".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="tennis" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
2 | George Foreman |
1 | Tim McCarver |
1 | Pete Rose |
1 | Nolan Ryan |
1 | Crash Davis |
1 | Spike Lee |
1 | John Madden |
1 | Michael Jordan |
1 | John Wayne |
1 | George Hamilton |
1 | Michael Dukakis |
1 | Jackie Robinson |
1 | Babe Ruth |
1 | Dennis Rodman |
1 | Madonna |
# Count the sources of all entries whose modifier is exactly "basketball".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="basketball" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
2 | Babe Ruth |
1 | Joseph Stalin |
1 | Martin Luther King, Jr. |
1 | Pol Pot |
1 | Johnny Appleseed |
1 | Adolf Hitler |
1 | Bugsy Siegel |
1 | Elvis Presley |
1 | Chuck Yeager |
# Count the sources of all entries whose modifier is exactly "football".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="football" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
1 | Ann Calvello |
1 | Michael Jordan |
1 | Bobby Fischer |
1 | Patrick Henry |
1 | Susan Lucci |
1 | Jackie Robinson |
1 | Babe Ruth |
1 | Rich Little |
# Count the sources of all entries whose modifier is exactly "racing".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="racing" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
2 | Rodney Dangerfield |
1 | John Madden |
1 | Bobo Holloman |
1 | Lou Gehrig |
1 | Wayne Gretzky |
# Count the sources of all entries whose modifier is exactly "golf".
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' -v modifier="golf" '$1 == modifier { print $2 }' \
  | sort \
  | uniq -c \
  | sort -n -r
count | source |
---|---|
2 | Michael Jordan |
2 | Jackie Robinson |
1 | J. D. Salinger |
1 | James Brown |
1 | Marlon Brando |
1 | Babe Ruth |
# Count the modifiers that contain one of the listed cultural domains and
# show the 13 most frequent.
../org.py -T -f modifier,sourceUrl README.org \
  | awk -F'\t' \
      -v domains='(dance|hip-hop|jazz|fashion|weaving|ballet|the art world|wine|salsa|juggling|tango)' \
      '$1 ~ domains { print $1 }' \
  | sort \
  | uniq -c \
  | sort -n -r \
  | head -n 13
count | modifier |
---|---|
10 | the art world |
10 | ballet |
9 | jazz |
9 | fashion |
8 | hip-hop |
8 | dance |
4 | wine |
4 | salsa |
2 | the hip-hop world |
2 | the fashion world |
2 | the fashion industry |
2 | the dance world |
2 | juggling |
# List every distinct modifier used with the source label "Michael Jordan".
../org.py -T -f sourceLabel,modifier README.org \
  | awk -F'\t' -v source="Michael Jordan" '$1 == source { print $2 }' \
  | sort -u
the Michael Jordan of