bibliometrics/cg_production2.sql at master · NETESOLUTIONS/bibliometrics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
-- Dataset for PARDI Clinical Guidelines
-- User: selects clinical guideline(s) from a list

-- System returns: Files containing:
-- 1. Guideline(s) metadata: Silverchair ID, Title, (if available) WOS ID, PMID, PMCID, WOS or PubMed Citation.
-- 2. First-Generation guideline references (G1): Silverchair ID, WOS ID, PMID, PMCID, Title, Authors, Citation
-- 3. G1 Support: G1 PMID, core_project_num, admin_phs_org_code, match_case, external_org_id, index_name, PI name(s)
-- 4. Second-Generation references (G2): WOS and/or PMID from file 2 and metadata on cited references: WOS ID, PMID, PMCID, Title, Authors, Citation
-- 5. G2 Support, G2 PMID, core_project_num, admin_phs_org_code, match_case, external_org_id, index_name, PI name(s).

-- The following tables are exported to /tmp
-- temp_cg_summary.csv
-- temp_cg_spires.csv
-- temp_cg_citedg1_spires.csv
-- temp_cg_citedg2_spires.csv


select current_timestamp;
set log_temp_files=0;
set temp_tablespaces = 'temp_tbs';

--map clinical guidelines to pmids
drop table if exists temp_cg_pmid;
create table temp_cg_pmid as select a.uid as silverchair_id, b.pmid from cg_uids a LEFT JOIN cg_uid_pmid_mapping b on a.uid=b.uid;

--map clinical guidelines from current inventory that have pmids to wos_uids
drop table if exists temp_cg_pmid_wos;
create table temp_cg_pmid_wos as select a.*,b.wos_uid from cg_uid_pmid_mapping a LEFT JOIN wos_pmid_mapping b on a.pmid::int=b.pmid_int;

-- this step speeds things up YMMV
set enable_seqscan = 'off';

-- map wos_uids to cited_wos_uids
drop table if exists temp_cg_pmid_wos_citedwos_g1;
create table temp_cg_pmid_wos_citedwos_g1 as select a.*,b.cited_source_uid  from temp_cg_pmid_wos a LEFT JOIN wos_references b
on a.wos_uid=b.source_id;

--clean up cited_source_uids
update temp_cg_pmid_wos_citedwos_g1
  set cited_source_uid =
  (
    case when cited_source_uid like 'WOS%'
           then substring(cited_source_uid, 1, 19)
         when cited_source_uid like 'MED%' or cited_source_uid like 'NON%' or
         cited_source_uid like 'CSC%' or cited_source_uid like 'INS%' or
         cited_source_uid like 'BCI%' or cited_source_uid=''
           then cited_source_uid
         else substring('WOS:'||cited_source_uid, 1, 19)
    end
  );

--add seq number to temp_cg_pmid_wos_citedwos_g1 to enable chunking
drop sequence if exists g1_seq;
create sequence g1_seq;
alter table temp_cg_pmid_wos_citedwos_g1  add column g1_seq integer not null default nextval('g1_seq');

-- build index on temp_cg_pmid_wos_citedwos_g1
drop index if exists temp_cg_pmid_idx;
create index temp_cg_pmid_idx on temp_cg_pmid_wos_citedwos_g1(cited_source_uid) tablespace wosindex_tbs;

--map cited_source_uids to second generation cited_source_uids in chunks
drop table if exists temp_cg1;
create table temp_cg1 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq <=10000;

drop table if exists temp_cg2;
create table temp_cg2 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq > 10000 and a.g1_seq <=20000;

drop table if exists temp_cg3;
create table temp_cg3 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq > 20000 and a.g1_seq <=30000;

drop table if exists temp_cg4;
create table temp_cg4 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq > 30000 and a.g1_seq <=40000;

drop table if exists temp_cg5;
create table temp_cg5 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq > 40000 and a.g1_seq <=50000;

drop table if exists temp_cg6;
create table temp_cg6 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq > 50000 and a.g1_seq <=60000;

drop table if exists temp_cg7;
create table temp_cg7 as
select a.*,b.cited_source_uid as cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1 a
LEFT JOIN  wos_references b on a.cited_source_uid=b.source_id
where b.cited_source_uid is not null and a.g1_seq > 60000;
select current_timestamp;

drop table if exists temp_cg_pmid_wos_citedwos_g1_g2;
create table temp_cg_pmid_wos_citedwos_g1_g2
as (select * from temp_cg1 union
select * from temp_cg2 union
select * from temp_cg3 union
select * from temp_cg4 union
select * from temp_cg5 union
select * from temp_cg6 union
select * from temp_cg7);

update temp_cg_pmid_wos_citedwos_g1_g2
  set cited_source_uid_g2 =
  (
    case when cited_source_uid_g2 like 'WOS%'
           then substring(cited_source_uid_g2, 1, 19)
         when cited_source_uid_g2 like 'MED%' or cited_source_uid_g2 like 'NON%' or
         cited_source_uid_g2 like 'CSC%' or cited_source_uid_g2 like 'INS%' or
         cited_source_uid_g2 like 'BCI%' or cited_source_uid_g2=''
           then cited_source_uid_g2
         else substring('WOS:'||cited_source_uid_g2, 1, 19)
    end
  );

select current_timestamp;

--clean up
drop table temp_cg1;
drop table temp_cg2;
drop table temp_cg3;
drop table temp_cg4;
drop table temp_cg5;
drop table temp_cg6;
drop table temp_cg7;

--create summary table of mapping counts
drop table if exists temp_cg_summary;
create table temp_cg_summary as
select uid, count(distinct pmid) as pmid,
count(distinct wos_uid) as wos_uid,
count(distinct cited_source_uid) as cited_wos_uid_g1,
count(distinct cited_source_uid_g2) as cited_wos_uid_g2
from temp_cg_pmid_wos_citedwos_g1_g2 group by uid order by uid::int;

-- SPIRES mapping to pmid, cited_wos_uid_g1, and cited_wos_uid_g2

--cg to pmid to spires
drop table if exists temp_cg_working_table;
create table temp_cg_working_table as
select distinct(uid),pmid from temp_cg_pmid_wos_citedwos_g1_g2;

drop table if exists temp_cg_spires;
create table temp_cg_spires as
select a.*,b.core_project_num,b.admin_phs_org_code,b.match_case,b.external_org_id,b.index_name
from temp_cg_working_table a LEFT JOIN spires_pub_projects b on a.pmid::int=b.pmid;

-- cg to cited_source_id_g1 to pmid to spires
drop table if exists temp_cg_working_table;
create table temp_cg_working_table as
select  distinct uid, pmid, cited_source_uid as cited_source_uid_g1
from temp_cg_pmid_wos_citedwos_g1_g2  order by uid;

drop table if exists temp_cg_working_table2;
create table temp_cg_working_table2 as
select a.*,b.pmid_int from temp_cg_working_table a
INNER JOIN wos_pmid_mapping b on a.cited_source_uid_g1=b.wos_uid;

drop table if exists temp_cg_citedg1_spires;
create table temp_cg_citedg1_spires as
select a.*,b.core_project_num,b.admin_phs_org_code,b.match_case,b.external_org_id,b.index_name
from temp_cg_working_table2 a INNER JOIN spires_pub_projects b on a.pmid_int=b.pmid;

--cg to cited_source_id_g2 to pmid to spires
drop table if exists temp_cg_working_table;
create table temp_cg_working_table as
select  distinct uid, pmid, cited_source_uid_g2
from temp_cg_pmid_wos_citedwos_g1_g2  order by uid;

drop table if exists temp_cg_working_table2;
create table temp_cg_working_table2 as
select a.*,b.pmid_int from temp_cg_working_table a
INNER JOIN wos_pmid_mapping b on a.cited_source_uid_g2=b.wos_uid;

drop table if exists temp_cg_citedg2_spires;
create table temp_cg_citedg2_spires as
select a.*,b.core_project_num,b.admin_phs_org_code,b.match_case,b.external_org_id,b.index_name
from temp_cg_working_table2 a INNER JOIN spires_pub_projects b on a.pmid_int=b.pmid;


copy(select * from temp_cg_summary) to '/tmp/temp_cg_summary.csv' CSV HEADER DELIMITER ',';
copy(select * from temp_cg_spires) to '/tmp/temp_cg_spires.csv' CSV HEADER DELIMITER ',';
copy(select * from temp_cg_citedg1_spires) to '/tmp/temp_cg_citedg1_spires.csv' CSV HEADER DELIMITER ',';
copy(select * from temp_cg_citedg2_spires) to '/tmp/temp_cg_citedg2_spires.csv' CSV HEADER DELIMITER ',';

select current_timestamp;

--clean up
drop table if exists temp_cg_working_table;
drop table if exists temp_cg_working_table2;