-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclass-scraper.py
More file actions
225 lines (182 loc) · 7.4 KB
/
class-scraper.py
File metadata and controls
225 lines (182 loc) · 7.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
### PREAMBLE ###
import pandas as pd
import numpy as np
import sympy
from fractions import Fraction
# before you begin, ensure that the ATLAS html page for M is saved to the same folder,
# as is a json entitled "[group]_class_info.json"
# THINGS THAT MAY NEED EDITING WHEN USED ON DIFFERENT PAGES:
# - the order of the group
# - the relative location of the table of conjugacy classes
# - whether the "Conjugacy class" and "Power up" column names have an extra space
# - any file names specific to the group name
def alphanumericNumbers(string):  # takes the number from the conjugacy class, eg 3B -> 3
    """Return the integer formed by the decimal digits found in ``string``.

    All decimal digits are concatenated in the order they appear and the
    result is converted with ``int`` (so ``"12A"`` gives ``12``).

    Raises:
        ValueError: if ``string`` contains no decimal digit.
    """
    # isdecimal() (not isnumeric()) keeps only characters int() can parse;
    # isnumeric() also accepts e.g. superscripts, which make int() fail.
    digits = "".join(char for char in string if char.isdecimal())
    if not digits:
        raise ValueError(f"no decimal digits found in {string!r}")
    return int(digits)
def lengthOfClass(GroupOrder, CentralizerOrders):
    """Return the class length for each centraliser order, as strings.

    Each entry is ``GroupOrder`` divided by the centraliser order, computed
    exactly with ``Fraction`` and truncated by ``int``. Entries whose string
    form is ``"nan"`` are reported as ``"-"``.
    """
    return [
        "-" if str(entry) == "nan" else str(int(GroupOrder * 1 / Fraction(entry)))
        for entry in CentralizerOrders
    ]
def CentralizerOrder(CentralizerOrder):
    """Return each centraliser order as a string, with ``"-"`` for missing values.

    An entry counts as missing when its string form is ``"nan"``.
    """
    as_text = (str(entry) for entry in CentralizerOrder)
    return ["-" if text == "nan" else text for text in as_text]
def CentralizerShape(CentralizerOrder, Order):
    """Return ``Order[i]`` where the centraliser order equals the element order.

    For every index ``i`` the result holds ``Order[i]`` when
    ``CentralizerOrder[i]`` is present and equal (as integers) to
    ``Order[i]``; otherwise it holds ``"-"``. Missing centraliser orders are
    those whose string form is ``"nan"``.
    """
    shapes = []
    for idx in range(len(CentralizerOrder)):
        centraliser = CentralizerOrder[idx]
        # Short-circuit so Order[idx] is only read for non-missing entries.
        if str(centraliser) != "nan" and int(centraliser) == int(Order[idx]):
            shapes.append(Order[idx])
        else:
            shapes.append("-")
    return shapes
def RationalTest(ClassName, ClassOrder, PowerUp):
    """Flag each class as rational (``True``), not rational (``False``) or ``"-"``.

    ``ClassName``, ``ClassOrder`` and ``PowerUp`` are parallel, index-addressable
    arrays. For each row:

    * if the power-up entry is missing (string form ``"nan"``), the result is
      ``True`` unless the class name contains ``"-"``, in which case it is
      ``"-"``;
    * otherwise the result is ``False`` when the power-up text starts with
      the class order written as a string, and ``True`` when it does not.

    A power-up text shorter than the order string simply fails the prefix
    test (result ``True``) instead of raising ``IndexError``.
    """
    # NOTE(review): the prefix test treats order 6 and a leading "60A" as a
    # match; PowerClasses guards against that case - confirm whether this
    # function should as well.
    OutputArray = []
    for row in range(len(ClassOrder)):
        order_text = str(ClassOrder[row])
        power_up_text = str(PowerUp[row])
        class_name = str(ClassName[row])
        if power_up_text == "nan":
            OutputArray.append(True if class_name.find("-") == -1 else "-")
        else:
            OutputArray.append(not power_up_text.startswith(order_text))
    return OutputArray
def RealTest(RationalInfoArray, OrderArray, PowerClassArray):
    """Flag each class as real (``True``), not real (``False``) or ``"-"``.

    The three arguments are parallel, index-addressable arrays. For each row:

    * a rational flag equal to ``True`` gives ``True``;
    * a rational flag of ``"-"`` gives ``"-"``;
    * otherwise the result is ``True`` only when the order is prime
      (``sympy.isprime``) and ``(order - 1) / power_classes`` is even.
    """
    verdicts = []
    for idx in range(len(RationalInfoArray)):
        flag = RationalInfoArray[idx]
        # Explicit comparison with True: the flag may also be the string "-",
        # which is truthy, so a plain truthiness test would be wrong.
        if flag == True:
            verdict = True
        elif flag == "-":
            verdict = "-"
        elif sympy.isprime(OrderArray[idx]) == True:
            if (OrderArray[idx] - 1) / (PowerClassArray[idx]) % 2 == 0:
                verdict = True
            else:
                verdict = False
        else:
            verdict = False
        verdicts.append(verdict)
    return verdicts
def PowerUpProperForm(PowerUp):
    """Normalise one "Power up" cell into its display form.

    * A missing value (string form ``"nan"``) becomes ``"-"``.
    * A numeric character directly after a letter gets a ``"^"`` in front of
      it, eg ``6A2 -> 6A^2``.
    * A space directly after another space is dropped, so runs of spaces
      collapse to a single space.

    Every other character is copied unchanged.
    """
    text = str(PowerUp)
    if text == "nan":
        return "-"
    pieces = []
    # Track the previous input character explicitly; indexing x[i-1] at i == 0
    # would wrap around to the last character of the string.
    previous = ""
    for char in text:
        if char.isnumeric() and previous.isalpha():
            pieces.append("^" + char)
        elif not (char == " " and previous == " "):
            pieces.append(char)
        previous = char
    return "".join(pieces)
def PowerClasses(ClassName, OrderArray, PowerUpArray):
    """Count, per class, the power-up entries that share the class's order.

    The three arguments are parallel, index-addressable arrays. For each row:

    * a class name containing ``"-"`` gives ``"-"``;
    * a power-up entry equal to ``"-"`` gives ``1``;
    * otherwise the count starts at 1 and grows by one for every
      whitespace-separated power-up word that starts with the order (as a
      string) immediately followed by a letter. The letter check stops e.g.
      a class of order 60 from counting towards a class of order 6.

    Words shorter than, or exactly equal to, the order string are skipped
    rather than raising ``IndexError``.
    """
    OutputArray = []
    for row in range(len(OrderArray)):
        order_text = str(OrderArray[row])  # order of this row, as text
        if ClassName[row].find("-") != -1:
            OutputArray.append("-")
        elif PowerUpArray[row] == "-":  # no power-up entry: a single power class
            OutputArray.append(1)
        else:
            count = 1
            for word in str(PowerUpArray[row]).split():
                # Slice instead of index so a word with nothing after the
                # order prefix yields "" (not alphabetic) instead of an error.
                following = word[len(order_text):len(order_text) + 1]
                if word.startswith(order_text) and following.isalpha():
                    count += 1
            OutputArray.append(count)
    return OutputArray
def ClassReps(ClassRepArray):
    """Return the class representatives with missing values replaced by ``"-"``.

    An entry whose string form is ``"nan"`` becomes ``"-"``; an entry whose
    string form is the "Omitted owing to length." marker becomes that marker
    string; every other entry is passed through unchanged.
    """
    omitted_marker = "Omitted owing to length."

    def _normalise(rep):
        text = str(rep)
        if text == "nan":
            return "-"
        if text == omitted_marker:
            return omitted_marker
        return rep

    return [_normalise(rep) for rep in ClassRepArray]
groupOrder = 4154781481226426191177580544000000

### SCRAPING THE HTML ###
# Read the locally saved ATLAS v3 page. The context manager closes the file
# as soon as the tables are parsed; a bare `f.close` (without parentheses)
# only references the method and never closes the handle.
with open("B/ATLAS_ B.html", "r") as f:
    df_list = pd.read_html(f)  # pandas finds the list of tables within the page
df = df_list[-2]  # finds the table of conjugacy classes, which has index -2
df["Centraliser order"] = df["Centraliser order"].str.replace(" ","")
df.to_csv("B/b_atlasSite_conjugacyInfo.csv")  # create a CSV for now as it's easier to work with than json

### CREATING MONSTER DF ###
class_df = pd.DataFrame()  # create empty class csv
# NOTE(review): the original comments warn that the "Conjugacy class" and
# "Power up" headers may carry an extra space from the scraper - confirm the
# exact column names against the scraped table.
IDArr = df["Conjugacy class"]
LengthArr = lengthOfClass(groupOrder,df["Centraliser order"])
OrderArr = df["Conjugacy class"].apply(alphanumericNumbers)
CentralizerOrderArr = CentralizerOrder(df["Centraliser order"])
CentralizerShapeArr = CentralizerShape(df["Centraliser order"],OrderArr)
CentralizerSmallGroupArr = "-"  # BLANK
RationalArr = RationalTest(IDArr,OrderArr, df["Power up"])
PowerUpArr = df["Power up"].apply(PowerUpProperForm)
PowerMapArr = "-"  # BLANK
PowerClassArr = PowerClasses(IDArr,OrderArr, PowerUpArr)
RealArr = RealTest(RationalArr,OrderArr,PowerClassArr)
ClassRepArr = ClassReps(df["Class rep(s)"])

# Column assignment order determines the column order of the output files.
class_df["id"] = IDArr
class_df["length"] = LengthArr
class_df["order"] = OrderArr
class_df["centralizer_order"] = CentralizerOrderArr
class_df["centralizer_shape"] = CentralizerShapeArr
class_df["centralizer_small_group"] = CentralizerSmallGroupArr
class_df["rational"] = RationalArr
class_df["real"] = RealArr
class_df["power_up"] = PowerUpArr
class_df["power_map"] = PowerMapArr
class_df["power_classes"] = PowerClassArr
class_df["representative"] = ClassRepArr

class_df.to_csv("B/b_class_info.csv")
class_df.to_json("B/b_class_info_generated.json", orient="records", indent=4)