#takes over a month
# Reduce every shard (0..127) to the lines that mention a citation-related
# keyword and store them gzip-compressed as <shard>.dat next to the input.
# Each shard is decompressed and piped through `~/lookup/showCnt1 blob`
# (presumably prints the content of each blob -- confirm against the tool)
# before the case-insensitive keyword match.
for i in {0..127}; do
  # The redirection has to belong to the gzip command: placed on a line of
  # its own it only creates an empty .dat file while the compressed stream
  # goes to stdout.
  zcat "/da5_data/play/cite/b2f$i" |
    ~/lookup/showCnt1 blob |
    grep --color=auto -iE 'zenodo|doi|article|proceedings|journal|conference' |
    gzip > "/da5_data/play/cite/b2f$i.dat"
done
# Call ./doi.sh once for every shard index from 0 through 127, sequentially;
# the whole loop is sent to the background.
for ((shard = 0; shard <= 127; shard++)); do
  ./doi.sh "$shard"
done &
# how many references
# For each shard keep the lines that contain "doi.org/" (any case) and write
#   <field1>;<field2>;|<id>|<id>...
# to b2f<N>.doi1 (gzip), where each <id> is the text found after "doi.org/"
# in the remaining ';'-separated fields. "%2F" is turned into "/" first, and
# lines without any identifier are dropped.
# The loop variable stays `i`: later commands in this file read $i.
for i in {0..127}; do
  zcat "b2f$i.dat" |
    grep -Ei 'doi\.org/' |
    perl -ane 's|%2F|/|g;($b,$f,@S)=split(/;/);$s=join " ", @S;$r="";while($s=~m|\b[dD][oO][iI]\.[oO][Rr][gG]/([/0-9\.a-zA-Z\-_"`]*)|g){$r.="|$1"}; print "$b;$f;$r\n" if $r ne "";' |
    gzip > "b2f$i.doi1"
done
# Build doi2P (gzip). Per shard, two sources of "<field1>;<doi>" pairs are
# merged, de-duplicated and joined on their first field with the shard's
# b2P128FullV map; fields 2-3 of the joined rows are then sorted on the
# first remaining field.
for shard in {0..127}; do
  {
    # Source 1: fields 1 and 3 of b2f<N>.doi, passed through cleanDOI.perl.
    zcat b2f$shard.doi | cut -d\; -f1,3 | perl cleanDOI.perl 1
    # Source 2: fields 1 and 3 of b2f<N>.doi1; the '|'-separated list is
    # split into one pair per entry, quotes/backticks are removed and the
    # entries "https" and "/" are skipped.
    zcat b2f$shard.doi1 | cut -d\; -f1,3 | perl -ane 'chop();s/;\|/;/;s|%2F|/|g;s/["`]//g;($p,$d)=split(/;/);@ds=split(/\|/,$d); for $d1 (@ds){print "$p;$d1\n" if $d1 ne "https" && $d1 ne "/"};'
  } |
    lsort 10G -t\; -k1,2 -u |
    join -t\; - <(zcat /da7_data/basemaps/gz/b2P128FullV.$shard.gz)
done |
  cut -d\; -f2-3 |
  lsort 50G -t\; -k1,1 |
  gzip > doi2P
# Spot checks: show the first lines (each cut to 200 characters) that contain
# a field matching "10.1016" on its own, for both extraction paths.
# NOTE(review): $i is not assigned on these lines; they use whatever value an
# earlier loop left in the shell -- confirm which shard is meant.
# Path 1: b2f$i.doi through cleanDOIdbg.perl (presumably a debug variant of
# cleanDOI.perl -- confirm).
zcat b2f$i.doi | cut -d\; -f1,3 | perl cleanDOIdbg.perl 1| grep ';10.1016;' |cut -c1-200|head
# Path 2: b2f$i.doi1 through the inline splitter, which prints every entry of
# the '|'-separated list followed by the complete list as a third field.
zcat b2f$i.doi1 | cut -d\; -f1,3 | perl -ane 'chop();s/;\|/;/;s/["`]//g;s|%2F|/|g;($p,$d)=split(/;/);@ds=split(/\|/,$d); for $d1 (@ds){print "$p;$d1;$d\n"};' | grep ';10.1016;' |cut -c1-200|head
# Get the number of authors and commits for each project.
# P2nAc (gzip): one "<ProjectID>;<NumAuthors>;<NumCommits>" line per record
# of the 32 P2summFullV files; a count that is absent is written as 0.
for part in {0..31}; do
  zcat /da1_data/basemaps/gz/P2summFullV$part.json
done |
  perl -ane '$l=$_;$l=~m|"ProjectID":"([^"]*)|; $p=$1; $na=0;if($l=~m|"NumAuthors":([0-9]*)|){$na=$1;};$nc=0;if($l=~m|"NumCommits":([0-9]*)|){$nc=$1};print "$p;$na;$nc\n"' |
  lsort 50G |
  gzip > P2nAc

# b2fStat: swap the two whitespace-separated columns of the b2f*nCperP files
# into "<col2>;<col1>" and join them on the first field with P2nAc.
cat b2f*nCperP |
  awk '{print $2";"$1}' |
  lsort 1G -t\; -k1,1 |
  join -t\; - <(zcat P2nAc | lsort 100G -t\; -k1,1) \
  > b2fStat

# Show the first 90 rows whose second field is below 3, ordered by the fourth
# field, numerically descending.
lsort 10G -t\; -rn -k4 < b2fStat |
  uniq |
  awk -F\; '$2 < 3' |
  head -90
#find blobs with references that are widely spread among repos
# For both the *.doiP and the *.bibP files: de-duplicate the lines, count how
# often each first field occurs and keep the 50 highest counts.
for spec in doiP:topDOI bibP:topBIB; do
  suffix=${spec%%:*}
  target=${spec##*:}
  zcat b2f*."$suffix" |
    lsort 10G -t\; -k1,2 |
    uniq |
    cut -d\; -f1 |
    uniq -c |
    lsort 1G -rn |
    head -50 > "$target"
done
3b9cc7b20317766a22c02a151c9ec0e79692a3d4
# The International Earth Rotation and Reference Systems Service (IERS)
# periodically uses leap seconds to keep UTC to within 0.9 s of UT1
# (a proxy for Earth's angle in space as measured by astronomers)
# and publishes leap second data in a copyrighted file
# <https://hpiers.obspm.fr/iers/bul/bulc/Leap_Second.dat>.
# See: Levine J. Coordinated Universal Time and the leap second.
# URSI Radio Sci Bull. 2016;89(4):30-6. doi:10.23919/URSIRSB.2016.7909995
# Search both bibliographic dumps for the title of the leap-second paper
# quoted above (case-insensitive, title deliberately truncated) and keep the
# matching records in `seconds` (fatcat export) and `seconds1` (papers dump).
grep --color=auto -i 'Coordinated Universal Time and the leap secon' \
  < <(zcat fatcat_bulk_exports_2024-02-18/release_export_expanded.json.gz) \
  > seconds
grep --color=auto -i 'Coordinated Universal Time and the leap sec' \
  < <(zcat sematicscolar/papers-part*.jsonl.gz) \
  > seconds1
# find most widely spread doi
# Every doi2P line is split on ';' into a DOI field and a second field; a DOI
# field that still holds several '|'-separated entries is expanded into one
# line per entry. The lines are then sorted and the occurrences of each DOI
# counted; `head` shows the ten highest counts.
# The '|' in the split pattern must be escaped: a bare /|/ is an alternation
# of two empty patterns and splits the DOI into single characters.
zcat doi2P |
  perl -ane 'chop();($d,$p)=split(/;/);@ds=split(/\|/,$d);for $d1 (@ds){print "$d1;$p\n"}' |
  lsort 30G -t\; -k1,2 |
  cut -d\; -f1 |
  uniq -c |
  lsort 10G -rn |
  head
144697
68851 10.1007/978
55034 10.23919/URSIRSB.2016.7909995
49387 10.1038/nmeth.3252|
25950 |
24778 10.1016
21869 10.1007
20867 10.1088
19124 10.1038/s41467
16512 10.1146/annurev
10.1038/nmeth.3252;6112893; Perspective Published: 29 January 2015; Orchestrating high-throughput genomic analysis with Bioconductor
{"corpusid":209884733,"externalids":{"ACL":null,"DBLP":null,"ArXiv":null,"MAG":"2795565782","CorpusId":"209884733","PubMed":null,"DOI":null,"PubMedCentral":null},"url":"https://www.semanticscholar.org/paper/980af8340bf44d3e5621addc82c6cac0e6d508ca","title":"Coordinated Universal Time and the Leap Second | NIST","authors":[{"authorId":"143842258","name":"J. Levine"}],"venue":"","p
ublicationvenueid":null,"year":2016,"referencecount":0,"citationcount":1,"influentialcitationcount":1,"isopenaccess":false,"s2fieldsofstudy":[{"category":"Physics","source":"s2-fos-model"},{"category":"Physics","source":"external"}],"publicationtypes":null,"publicationdate":"2016-12-01","journal":{"name":"Radio Science","pages":null,"volume":""}}
# Show the grep patterns kept in the file `top`. The two lines that follow
# are its contents as displayed (numeric ids wrapped in ':' and ','), not
# commands.
cat top
:6112893,
:209884733,
# Keep every line of the citations dump that contains one of those patterns
# as a fixed string (-F), reading the patterns from `top` (-f).
zcat sematicscolar/citations-part*.jsonl.gz | grep --color=auto -Ff top > top.citations
{"citationid":4740408346,"citingcorpusid":268754447,"citedcorpusid":209884733,"isinfluential":true,"contexts":["To ensure agreement between UTC and the time derived from the Earth's rotation (UT1), TAI is compared with UT1, if the difference is greater than 0.9 seconds, a Leap Second is applied (Levine, 2016)."],"intents":[["methodology"]]}
# DOI -> corpus id map.
# For every record of the papers dump that has a numeric "corpusid" and a
# quoted "DOI" value, write "<DOI>;<corpusid>" to doi2cid (gzip).
zcat sematicscolar/papers-part*.jsonl.gz |
  perl -ne '
    next unless m/"corpusid":([0-9]*)/;
    $cid = $1;
    print "$1;$cid\n" if m/"DOI":"([^"]*)"/;
  ' |
  gzip > doi2cid
# Top by blob is 10.23919/URSIRSB.2016.7909995
# List the doi2P lines that mention this DOI. -F makes grep treat the DOI as
# a literal string; as a regular expression every '.' in it would match any
# character.
zcat doi2P | grep -F '10.23919/URSIRSB.2016.7909995'
# now get cids and then impact from citations_part*
###########################################################################
# clean doi
# Collect the third ';'-separated field of every b2f<N>.doi shard into
# all.git.doi (gzip), one '|'-separated entry per line; the first ";|" of a
# line is reduced to ";" beforehand. The job runs in the background, so
# all.git.doi is only complete once it has finished.
for i in {0..127}; do zcat b2f$i.doi; done | sed 's/;|/;/' | cut -d\; -f3 | perl -ane 's/\|/\n/g;print' | gzip > all.git.doi &
# NOTE(review): grep is given no pattern here; the command is incomplete as
# recorded.
zcat all.git.doi | grep -i
# Run the collected entries through cleanDOI.perl and show at most 300
# characters of every output line.
zcat all.git.doi | perl -an cleanDOI.perl | cut -c1-300
# NOTE(review): this pipeline has no producer (sort reads stdin), so it looks
# like the tail of a longer command -- confirm. It swaps the first two
# whitespace-separated columns into "<col2>;<col1>", passes them to
# `~/lookup/showCnt blob 2` and pages through the lines matching doi /
# @article{ / @incollection, each with two lines of trailing context.
sort -rn | awk '{print $2";"$1}' | ~/lookup/showCnt blob 2| grep -EiA2 '(doi\b|@article{|@incollection)'| less
# Now look at what papers cite these top references
#how many blobs with
#a) zenodo
# Concatenate all b2f*.zenodo files, pass them through `lsort 10G -u`
# (presumably a sort wrapper that drops duplicate lines -- confirm) and count
# with wc. The bare number below is the recorded result, not a command.
cat b2f*.zenodo| lsort 10G -u | wc
436177
#b) arxiv
# Same count over the b2f*.axiv files; the number below is again the
# recorded result.
# NOTE(review): the glob says "axiv", not "arxiv" -- confirm the file names.
cat b2f*.axiv| lsort 10G -u | wc
426263
#c) doi
# doi.gz: lines of the .dat shards that contain "doi" ending at a word
# boundary (any case) and also a lower-case "doi." starting at one.
# NOTE(review): only shards 0..125 are read here, while the other loops in
# this file cover 0..127 -- confirm whether that is intended.
for shard in {0..125}; do
  zcat b2f$shard.dat
done |
  grep -i 'doi\b' |
  grep '\bdoi\.' |
  gzip > doi.gz

# bib.gz: lines of the same shards that contain an @article, @inproceedings
# or @book entry marker (any case).
for shard in {0..125}; do
  zcat b2f$shard.dat
done |
  grep -Ei '\@(article|inproceedings|book)' |
  gzip > bib.gz
# Number of distinct first fields among the doi.gz lines. The two greps
# repeat the filters that were already applied when doi.gz was written. The
# bare number below is the recorded result, not a command.
zcat doi.gz | grep -i 'doi\b' | grep '\bdoi\.'| cut -d\; -f1 | uniq | lsort 10G -u | wc -l
1280071
# Number of distinct first fields among the bib.gz lines, followed by the
# recorded result.
zcat bib.gz|cut -d\; -f1 |lsort 10G -u | wc -l
923282
# First attempt: print every run of characters (no '.' or space) that is
# followed by ".doi." or "/doi." and one further character.
# The match operator needs its leading `m` when '|' is the delimiter;
# without it perl rejects the program with a syntax error.
zcat doi.gz | perl -ane 'while(m|([^\. ]*[\./]doi\.[^\.])|g){ print "$1\n";}'

# Refined version: capture the whole "<prefix>.doi.<domain>.<rest>" token,
# remove embedded newlines and everything up to a leading "https://" or
# "http://", and store the distinct results in doi.u.
zcat doi.gz |
  perl -ane 'while(m|([^\. \(\{]*[\./]doi\.[^\.]*\.[^\)\( ,}"]*)|g){ $s=$1; $s=~s/\n//;$s=~s|.*https://||;$s=~s|.*http://||;print "$s\n";}' |
  lsort 1G -u > doi.u
# Two subsets of the release export, both stored gzip-compressed:
#   bbglgh - records that mention bitbucket, gitlab or github
#   url    - records that contain an http:// or https:// URL
zcat release_export_expanded.json.gz |
  grep -E 'bitbucket|gitlab|github' |
  gzip > bbglgh
zcat release_export_expanded.json.gz |
  grep -E 'https?://' |
  gzip > url
# For every record in bbglgh, take its "work_id" value and print one
# "<work_id>;<owner>/<repo>" line per "/github.com/<owner>/<repo>" occurrence.
# The repo part stops at the first of / space " } , : . ) ( & and anything
# from a ',' or '"' onward is cut off the captured text. Output goes to
# bbglg.gh (gzip).
zcat bbglgh| perl -ane 'chop();$s=$_;$s0=$s;$s0=~m|"work_id":"([^"]*)|;$s0=$1;while($s=~m|/github.com/([^/]*/[^/ "},:\.\)\(&]*)|g){$v=$1;$v=~s/[,"].*//;print "$s0;$v\n"}' | gzip > bbglg.gh
# do counting for woc
# NOTE(review): these commands were recorded collapsed onto three lines (the
# first one entirely behind a '#'), with the backslashes of `\;`, `\(`, `\)`
# and several `*` missing. They are restored below, one command per line --
# confirm against the shell history if it is still available.

# Any mention of the project URLs in the fatcat URL list? (returned nothing)
zcat fatcat_bulk_exports_2024-02-18/url.u | grep -E 'org/swsc|ssc-oscar|woc-hack'

# Papers dump: records mentioning "mockus"; the titles containing
# "world of c" yield the grep patterns ":<id>," stored in `woc`.
zcat sematicscolar/papers-part*.jsonl.gz | grep --color=auto -i mockus > mockus.papers
grep -i 'world of c' mockus.papers | cut -d, -f1 | cut -d: -f2 | sed 's|^|:|;s|$|,|' > woc

# Citation records that contain one of those ids.
zcat sematicscolar/citations-part*jsonl.gz | grep -Ff woc > woc.citations
# NOTE(review): `woc1` is not created anywhere in the visible part of this
# file -- confirm where it comes from.
# Citing ids of the woc1 matches, turned into ":<id>," patterns again.
grep -Ff woc1 woc.citations |
  sed 's|.*"citingcorpusid":||;s|,.*||' |
  sort -u |
  sed 's|^|:|;s|$|,|' > woc1.citing
# Paper records and citation records that contain one of the citing ids.
zcat sematicscolar/papers-part*.jsonl.gz | grep -Ff woc1.citing > woc.citations.papers
zcat sematicscolar/citations-part*jsonl.gz | grep -Ff woc1.citing > woc.citations.citing

# Same search in the fatcat release export.
zcat fatcat_bulk_exports_2024-02-18/release_export_expanded.json.gz | grep -i mockus > mockus.papers.fatcat
# Reduce each matching record to "<work_id>;<title>;<ident>".
# The result is written to woc.idList, the file the next command reads; the
# recorded command wrote to `idList`, which nothing else uses.
grep -i 'world of c' mockus.papers.fatcat |
  sed 's|^.*"work_id":"\([^"]*\)"|\1|;s|,"title":"\([^"]*\)"|;\1|;s|,"state":"active",||;s|"ident":"\([^"]*\)".*|;\1|' > woc.idList
# Distinct ids from the first and third field, used as fixed-string patterns.
(cut -d\; -f1 woc.idList; cut -d\; -f3 woc.idList) | sort -u > woc.idList1
grep -Ff woc.idList1 fatcat_bulk_exports_2024-02-18/refcat-brefcombined-2022-01-03.json > woc.fatcat_refcat-bref
zcat fatcat_bulk_exports_2024-02-18/container_export.json.gz | grep -Ff woc.idList1 > woc.fatcat.citations.papers
# toCited.gz: one "<citingcorpusid>;<citedcorpusid>" line per citation
# record, sorted on the citing id; records whose cited id is null are
# dropped.
# The field separator has to be written `\;` -- a bare ';' ends the command,
# so the sort would never receive its key options. The glob and the first
# sed expression also need their `*` to match all dump parts and to strip
# everything before "citingcorpusid".
zcat sematicscolar/citations-part*jsonl.gz |
  sed 's|.*"citingcorpusid":||;s|,"citedcorpusid":|;|;s|,.*||' |
  grep -v ';null$' |
  lsort 200G -t\; -k1,1 |
  gzip > toCited.gz

# fromCited.gz: the same pairs with the columns swapped
# ("<citedcorpusid>;<citingcorpusid>"), sorted on the cited id.
zcat toCited.gz |
  awk -F\; '{print $2";"$1}' |
  lsort 200G -t\; -k1,1 |
  gzip > fromCited.gz
# Feed the fromCited edge list to ~/lookup/connectExportPlain.perl, passing
# the base name as its argument.
n=fromCited
zcat $n.gz | perl ~/lookup/connectExportPlain.perl $n
n=fromCited
# NOTE(review): the bare `zcat` below has no input file and no pipe; the
# command looks truncated as recorded -- confirm what was meant to be read.
zcat
# Run reRankV3.perl on ${n}.crank.map; stderr goes to p2PFull.V3.err and the
# gzip-compressed stdout to p2PFull.V3.s.
perl reRankV3.perl ${n}.crank.map 2> p2PFull.V3.err |gzip > p2PFull.V3.s