-
Notifications
You must be signed in to change notification settings - Fork 268
Expand file tree
/
Copy pathagent.py
More file actions
3106 lines (2601 loc) · 141 KB
/
agent.py
File metadata and controls
3106 lines (2601 loc) · 141 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
#
# AzureMonitoringLinuxAgent Extension
#
# Copyright 2021 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import os
import os.path
import datetime
import signal
import pwd
import glob
import grp
import re
import filecmp
import stat
import traceback
import time
import platform
import subprocess
import json
import base64
import inspect
import shutil
import hashlib
import fileinput
import contextlib
import ama_tst.modules.install.supported_distros as supported_distros
from collections import OrderedDict
from hashlib import sha256
from shutil import copyfile, rmtree, copytree, copy2
from threading import Thread
import telegraf_utils.telegraf_config_handler as telhandler
import metrics_ext_utils.metrics_constants as metrics_constants
import metrics_ext_utils.metrics_ext_handler as me_handler
import metrics_ext_utils.metrics_common_utils as metrics_utils
try:
import urllib.request as urllib # Python 3+
except ImportError:
import urllib2 as urllib # Python 2
try:
from urllib.parse import urlparse # Python 3+
except ImportError:
from urlparse import urlparse # Python 2
try:
import urllib.error as urlerror # Python 3+
except ImportError:
import urllib2 as urlerror # Python 2
# python shim can only make IMDS calls which shouldn't go through proxy
try:
    # Replace the module's proxy lookup with a stub that always reports
    # "no proxies"; the optional positional argument is accepted and ignored.
    urllib.getproxies = lambda x = None: {}
except Exception as e:
    # Best effort: a failure is only reported on stdout and loading continues.
    print('Resetting proxies failed with error: {0}'.format(e))
try:
from Utils.WAAgentUtil import waagent
import Utils.HandlerUtil as HUtil
except Exception as e:
# These utils have checks around the use of them; this is not an exit case
print('Importing utils failed with error: {0}'.format(e))
# This code is taken from the omsagent's extension wrapper.
# This same monkey patch fix is relevant for AMA extension as well.
# This monkey patch duplicates the one made in the waagent import above.
# It is necessary because on 2.6, the waagent monkey patch appears to be overridden
# by the python-future subprocess.check_output backport.
if sys.version_info < (2,7):
    def check_output(*popenargs, **kwargs):
        r"""Backport from subprocess module from python 2.7

        Runs the command described by the Popen arguments, captures its
        stdout and returns it. Raises ValueError if the caller passes
        'stdout', and subprocess.CalledProcessError (carrying the captured
        output) when the command exits with a non-zero status.
        """
        if 'stdout' in kwargs:
            raise ValueError('stdout argument not allowed, it will be overridden.')
        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
        # stderr is not redirected here, so the second element is not used
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            # Report the command as given: the 'args' keyword if present,
            # otherwise the first positional argument
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            raise subprocess.CalledProcessError(retcode, cmd, output=output)
        return output

    # Exception classes used by this module.
    class CalledProcessError(Exception):
        # Replacement exception whose constructor accepts the 'output'
        # keyword that check_output above passes when raising.
        def __init__(self, returncode, cmd, output=None):
            self.returncode = returncode
            self.cmd = cmd
            self.output = output

        def __str__(self):
            return "Command '%s' returned non-zero exit status %d" % (self.cmd, self.returncode)

    # Install both replacements on the subprocess module itself so that every
    # later subprocess.check_output / subprocess.CalledProcessError lookup
    # resolves to the definitions above.
    subprocess.check_output = check_output
    subprocess.CalledProcessError = CalledProcessError
# Global Variables
# Directory name that install() joins onto the current working directory to locate the package bundle
PackagesDirectory = 'packages'
# The BundleFileName values will be replaced by actual values in the release pipeline. See apply_version.sh.
BundleFileNameDeb = 'azuremonitoragent.deb'
BundleFileNameRpm = 'azuremonitoragent.rpm'
# Empty at import time; install() joins it onto the packages directory.
# NOTE(review): presumably filled in by find_package_manager() - confirm
BundleFileName = ''
TelegrafBinName = 'telegraf'
InitialRetrySleepSeconds = 30
# Empty at import time; the functions below compare it against "dpkg" and "rpm".
# NOTE(review): presumably set by find_package_manager() - confirm
PackageManager = ''
# Extra options placed between the package manager name and "-i" in the install command
PackageManagerOptions = ''
MdsdCounterJsonPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/metricCounters.json'
FluentCfgPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/fluentbit/td-agent.conf'
AMASyslogConfigMarkerPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/syslog.marker'
AMASyslogPortFilePath = '/etc/opt/microsoft/azuremonitoragent/config-cache/syslog.port'
AMAFluentPortFilePath = '/etc/opt/microsoft/azuremonitoragent/config-cache/fluent.port'
PreviewFeaturesDirectory = '/etc/opt/microsoft/azuremonitoragent/config-cache/previewFeatures/'
ArcSettingsFile = '/var/opt/azcmagent/localconfig.json'
AMAAstTransformConfigMarkerPath = '/etc/opt/microsoft/azuremonitoragent/config-cache/agenttransform.marker'
# Logrotate config that uninstall() deletes when it exists
AMAExtensionLogRotateFilePath = '/etc/logrotate.d/azuremonitoragentextension'
WAGuestAgentLogRotateFilePath = '/etc/logrotate.d/waagent-extn.logrotate'
SupportedArch = set(['x86_64', 'aarch64'])
# Error codes
GenericErrorCode = 1
UnsupportedOperatingSystem = 51
# NOTE(review): same numeric value as UnsupportedOperatingSystem - confirm this is intended
IndeterminateOperatingSystem = 51
MissingorInvalidParameterErrorCode = 53
DPKGOrRPMLockedErrorCode = 56
# Also returned by check_disk_space_availability() when disk space is low
MissingDependency = 52
# Settings
GenevaConfigKey = "genevaConfiguration"
AzureMonitorConfigKey = "azureMonitorConfiguration"
# Configuration
# Assigned in main() from parse_context(operation)
HUtilObject = None
SettingsSequenceNumber = None
HandlerEnvironment = None
SettingsDict = None
def main():
    """
    Main method
    Parse out operation from argument, invoke the operation, and finish.

    The operation is chosen by matching sys.argv[1] (optionally prefixed by
    any number of '-' or '/' characters) against the known operation
    keywords. The outcome is always reported through
    log_and_exit(operation, exit_code, message).
    """
    global HUtilObject

    init_waagent_logger()
    waagent_log_info('Azure Monitoring Agent for Linux started to handle.')

    # Keyword expected at the start of the argument -> operation name.
    # The patterns are anchored at the start, so 'uninstall' can never be
    # mistaken for 'install'.
    known_operations = (
        ('disable', 'Disable'),
        ('uninstall', 'Uninstall'),
        ('install', 'Install'),
        ('enable', 'Enable'),
        ('update', 'Update'),
        ('metrics', 'Metrics'),
        ('syslogconfig', 'Syslogconfig'),
        ('transformconfig', 'Transformconfig'),
    )

    # Determine the operation being executed
    operation = None
    try:
        option = sys.argv[1]
        for keyword, operation_name in known_operations:
            if re.match('^([-/]*)({0})'.format(keyword), option):
                operation = operation_name
                break
    except Exception as e:
        # Covers a missing argument (IndexError) as well as anything unexpected
        waagent_log_error(str(e))

    if operation is None:
        log_and_exit('Unknown', GenericErrorCode, 'No valid operation provided')

    # Set up for exit code and any error messages
    exit_code = 0
    message = '{0} succeeded'.format(operation)

    # Avoid entering broken state where manual purge actions are necessary in low disk space scenario
    destructive_operations = ['Disable', 'Uninstall']
    if operation not in destructive_operations:
        exit_code = check_disk_space_availability()
        if exit_code != 0:
            message = '{0} failed due to low disk space'.format(operation)
            log_and_exit(operation, exit_code, message)

    # Invoke operation
    try:
        HUtilObject = parse_context(operation)
        exit_code, output = operations[operation]()

        # Exit code 1 indicates a general problem that doesn't have a more
        # specific error code; it often indicates a missing dependency
        if exit_code == 1 and operation == 'Install':
            message = 'Install failed with exit code 1. For error details, check logs ' \
                      'in /var/log/azure/Microsoft.Azure.Monitor' \
                      '.AzureMonitorLinuxAgent'
        # Compare by value: an identity test ('is') on integers only works by
        # accident of the interpreter caching small ints.
        elif exit_code == DPKGOrRPMLockedErrorCode and operation == 'Install':
            message = 'Install failed with exit code {0} because the ' \
                      'package manager on the VM is currently locked: ' \
                      'please wait and try again'.format(DPKGOrRPMLockedErrorCode)
        elif exit_code != 0:
            message = '{0} failed with exit code {1} {2}'.format(operation,
                                                                 exit_code, output)
    except AzureMonitorAgentForLinuxException as e:
        exit_code = e.error_code
        message = e.get_error_message(operation)
    except Exception as e:
        exit_code = GenericErrorCode
        message = '{0} failed with error: {1}\n' \
                  'Stacktrace: {2}'.format(operation, e,
                                           traceback.format_exc())

    # Finish up and log messages
    log_and_exit(operation, exit_code, message)
def check_disk_space_availability():
    """
    Check if there is the required space on the machine.

    Returns MissingDependency when any of /var, /etc or /opt has less than
    500 MB free, otherwise 0. The check is best effort: if the free space
    cannot be determined, a message is printed and 0 is returned so the
    operation is not blocked.
    """
    try:
        if any(get_free_space_mb(path) < 500 for path in ("/var", "/etc", "/opt")):
            # 52 is the exit code for missing dependency i.e. disk space
            # https://github.com/Azure/azure-marketplace/wiki/Extension-Build-Notes-Best-Practices#error-codes-and-messages-output-to-stderr
            return MissingDependency
        return 0
    except Exception:
        # Deliberately not a bare except, so KeyboardInterrupt and SystemExit
        # are not swallowed by this best-effort check.
        print('Failed to check disk usage.')
        return 0
def get_free_space_mb(dirname):
    """
    Return the free space, in whole megabytes, of the filesystem that holds
    dirname, computed from the statvfs f_bavail and f_frsize fields.
    """
    fs_stats = os.statvfs(dirname)
    available_bytes = fs_stats.f_bavail * fs_stats.f_frsize
    bytes_per_mb = 1024 * 1024
    return available_bytes // bytes_per_mb
def is_systemd():
    """
    Report whether the /run/systemd/system directory exists, which is used
    here as the indicator that the system is running systemd.
    """
    systemd_runtime_dir = "/run/systemd/system"
    return os.path.isdir(systemd_runtime_dir)
def get_service_command(service, *operations):
    """
    Build the shell command that applies the given operation(s) to a service.

    With systemd, one "systemctl <operation> <service>" command is produced
    per operation and they are chained with " && ". Without systemd, the
    init.d script of the service is used and only the first operation is
    applied.
    """
    if not is_systemd():
        hutil_log_info("The VM doesn't have systemctl. Using the init.d service to start {0}.".format(service))
        return '/etc/init.d/{0} {1}'.format(service, operations[0])

    systemctl_commands = []
    for operation in operations:
        systemctl_commands.append("systemctl {0} {1}".format(operation, service))
    return " && ".join(systemctl_commands)
def check_kill_process(pstring):
    """
    Send SIGKILL to every process whose `ps ax` line matches pstring.

    pstring is interpolated unquoted into a shell pipeline and used as a grep
    pattern, so it must only ever come from trusted, hard-coded callers.
    Processes that have already exited by the time the signal is sent are
    ignored; any other failure to signal a process is raised.
    """
    import errno  # local import: only this function needs it

    ps_pipe = os.popen("ps ax | grep " + pstring + " | grep -v grep")
    try:
        matching_lines = ps_pipe.readlines()
    finally:
        # Always release the pipe and reap the shell, even if reading fails
        ps_pipe.close()

    for line in matching_lines:
        fields = line.split()
        # Skip blank lines and anything that does not start with a PID
        # (e.g. the ps header line if the pattern happens to match it)
        if not fields or not fields[0].isdigit():
            continue
        try:
            os.kill(int(fields[0]), signal.SIGKILL)
        except OSError as e:
            # The process may have exited between the listing and the kill
            if e.errno != errno.ESRCH:
                raise
def compare_and_copy_bin(src, dest):
    """
    Make dest an executable copy of src.

    Nothing happens when src is not a regular file. When dest is missing it is
    copied from src; when dest exists but differs from src it is removed and
    copied again; when both are the same no copy is made. In every case where
    src exists, dest ends up with mode rwxr-xr-x.
    """
    if not os.path.isfile(src):
        return

    if not os.path.isfile(dest):
        # No previous binary exist, simply copy it
        copyfile(src, dest)
    elif not filecmp.cmp(src, dest):
        # Removing the file in case it is already being run in a process,
        # in which case we can get an error "text file busy" while copying
        os.remove(dest)
        copyfile(src, dest)

    owner_bits = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
    group_bits = stat.S_IRGRP | stat.S_IXGRP
    other_bits = stat.S_IROTH | stat.S_IXOTH
    os.chmod(dest, owner_bits | group_bits | other_bits)
def set_metrics_binaries():
    """
    Keep only the MetricsExtension binary for this machine's architecture.

    Inside <cwd>/MetricsExtensionBin, the file metricsextension_<arch> (if
    present) is renamed to MetricsExtension, then every other entry in that
    directory is removed.
    """
    me_dir = os.path.join(os.getcwd(), 'MetricsExtensionBin')
    arch_specific_binary = os.path.join(me_dir, 'metricsextension_' + platform.machine())

    # Rename the Arch appropriate metrics extension binary to MetricsExtension
    if os.path.exists(arch_specific_binary):
        os.rename(arch_specific_binary, os.path.join(me_dir, 'MetricsExtension'))

    # Cleanup unused ME binaries
    leftovers = [entry for entry in os.listdir(me_dir) if entry != 'MetricsExtension']
    for entry in leftovers:
        os.remove(os.path.join(me_dir, entry))
def copy_amacoreagent_binaries():
    """
    Copy the amacoreagent binary, its architecture-specific shared libraries
    and the agentlauncher binary from the current working directory into
    /opt/microsoft/azuremonitoragent/bin, using compare_and_copy_bin for each.
    """
    arch = platform.machine()
    cwd = os.getcwd()

    # (bundled file, installed location), in the order they are copied
    copy_plan = [
        (cwd + "/amaCoreAgentBin/amacoreagent_" + arch,
         "/opt/microsoft/azuremonitoragent/bin/amacoreagent"),
    ]

    if arch == 'x86_64':
        copy_plan.append((cwd + "/amaCoreAgentBin/libgrpc_csharp_ext.x64.so",
                          "/opt/microsoft/azuremonitoragent/bin/libgrpc_csharp_ext.x64.so"))
        copy_plan.append((cwd + "/amaCoreAgentBin/liblz4x64.so",
                          "/opt/microsoft/azuremonitoragent/bin/liblz4x64.so"))
    elif arch == 'aarch64':
        copy_plan.append((cwd + "/amaCoreAgentBin/libgrpc_csharp_ext.arm64.so",
                          "/opt/microsoft/azuremonitoragent/bin/libgrpc_csharp_ext.arm64.so"))

    copy_plan.append((cwd + "/agentLauncherBin/agentlauncher_" + arch,
                      "/opt/microsoft/azuremonitoragent/bin/agentlauncher"))

    for bundled_path, installed_path in copy_plan:
        compare_and_copy_bin(bundled_path, installed_path)
def copy_mdsd_fluentbit_binaries():
    """
    Install the bundled mdsd, mdsdmgr and fluent-bit binaries for the current
    architecture into /opt/microsoft/azuremonitoragent/bin, but only when an
    `ldd ... | grep "not found"` probe of the bundled binary does not return 0.

    A temporary copy of /opt/microsoft/azuremonitoragent/lib is created as
    <cwd>/lib for the duration of the checks and removed at the end.
    """
    current_arch = platform.machine()
    mdsd_bin_local_path = os.getcwd() + "/mdsdBin/mdsd_" + current_arch
    mdsdmgr_bin_local_path = os.getcwd() + "/mdsdBin/mdsdmgr_" + current_arch
    fluentbit_bin_local_path = os.getcwd() + "/fluentBitBin/fluent-bit_" + current_arch
    mdsd_bin = "/opt/microsoft/azuremonitoragent/bin/mdsd"
    mdsdmgr_bin = "/opt/microsoft/azuremonitoragent/bin/mdsdmgr"
    fluentbit_bin = "/opt/microsoft/azuremonitoragent/bin/fluent-bit"

    # copy the required libs to our test directory first
    lib_dir = os.path.join(os.getcwd(), "lib")
    # Start from a clean directory so stale libraries from an earlier run are not reused
    if os.path.exists(lib_dir):
        rmtree(lib_dir)

    if sys.version_info >= (3, 8):
        # dirs_exist_ok parameter was added in Python 3.8
        copytree("/opt/microsoft/azuremonitoragent/lib", lib_dir, dirs_exist_ok=True)
    else:
        copytree("/opt/microsoft/azuremonitoragent/lib", lib_dir)

    # grep exits with 0 only when it finds a "not found" line, so a non-zero
    # value is treated as "all shared libraries resolved".
    # NOTE(review): presumably run_command_and_log returns the pipeline's exit status first - confirm
    canUseSharedmdsd, _ = run_command_and_log('ldd ' + mdsd_bin_local_path + ' | grep "not found"')
    canUseSharedmdsdmgr, _ = run_command_and_log('ldd ' + mdsdmgr_bin_local_path + ' | grep "not found"')
    # mdsd and mdsdmgr are only replaced together
    if canUseSharedmdsd != 0 and canUseSharedmdsdmgr != 0:
        compare_and_copy_bin(mdsd_bin_local_path, mdsd_bin)
        compare_and_copy_bin(mdsdmgr_bin_local_path, mdsdmgr_bin)

    canUseSharedfluentbit, _ = run_command_and_log('ldd ' + fluentbit_bin_local_path + ' | grep "not found"')
    if canUseSharedfluentbit != 0:
        compare_and_copy_bin(fluentbit_bin_local_path, fluentbit_bin)

    # Remove the temporary library copy created above.
    # NOTE(review): this cleanup is skipped if anything above raises - confirm that is acceptable
    rmtree(os.getcwd() + "/lib")
def get_installed_package_version():
    """
    Query the package manager for installed Azure Monitor Agent packages.

    Returns: (is_installed, version_list) - version_list holds one entry per
    line printed by the query; (False, []) is returned when the package
    manager is unknown, the query fails, or it prints nothing.
    """
    query_by_manager = {
        # In the case of dpkg, we specify only Package and Version as architecture is written as amd64/arm64 instead of x86_64/aarch64.
        "dpkg": "dpkg-query -W -f='${Package}_${Version}\n' 'azuremonitoragent*' 2>/dev/null",
        "rpm": "rpm -q azuremonitoragent",
    }

    query = query_by_manager.get(PackageManager)
    if query is None:
        hutil_log_error("Could not determine package manager.")
        return False, []

    exit_code, output = run_command_and_log(query, check_error=False)
    package_found = exit_code == 0 and bool(output)
    if not package_found:
        hutil_log_info("Azure Monitor Agent package not found after running {0}.".format(query))
        return False, []

    return True, output.strip().split('\n')
def get_current_bundle_file():
    """
    Return the bundle file name for the active package manager with its
    .deb / .rpm extension removed, or "" when the package manager is neither
    dpkg nor rpm.
    """
    if PackageManager == 'dpkg':
        bundle_name, extension = BundleFileNameDeb, '.deb'
    elif PackageManager == 'rpm':
        bundle_name, extension = BundleFileNameRpm, '.rpm'
    else:
        return ""
    # Keep everything before the last occurrence of the extension
    return bundle_name.rsplit(extension, 1)[0]
def install():
    """
    Ensure that this VM distro and version are supported.
    Install the Azure Monitor Linux Agent package, using retries.
    Note: install operation times out from WAAgent at 15 minutes, so do not
    wait longer.

    Returns: (exit_code, output) - (0, success message) when every step
    succeeded, otherwise the exit code and output of the first failing command.
    """
    exit_if_vm_not_supported('Install')
    find_package_manager("Install")
    set_os_arch('Install')
    vm_dist, vm_ver = find_vm_distro('Install')

    # Check if Debian 12 VMs have rsyslog package (required for AMA 1.31+)
    if (vm_dist.startswith('debian')) and vm_ver.startswith('12'):
        check_rsyslog, _ = run_command_and_log("dpkg -s rsyslog")
        if check_rsyslog != 0:
            hutil_log_info("'rsyslog' package missing from Debian 12 machine, installing to allow AMA to run.")
            # The trailing backslash continues the string literal, so the next
            # source line is part of the same shell command text.
            rsyslog_exit_code, rsyslog_output = run_command_and_log("DEBIAN_FRONTEND=noninteractive apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y rsyslog")
            if rsyslog_exit_code != 0:
                return rsyslog_exit_code, rsyslog_output

    # Check if Amazon 2023 VMs have rsyslog package (required for AMA 1.31+)
    if (vm_dist.startswith('amzn')) and vm_ver.startswith('2023'):
        # NOTE(review): the grep pattern names the x86_64 package only, so on
        # other architectures this check looks like it always reports "missing"
        # and falls through to the dnf install below - confirm
        check_rsyslog, _ = run_command_and_log("dnf list installed | grep rsyslog.x86_64")
        if check_rsyslog != 0:
            hutil_log_info("'rsyslog' package missing from Amazon Linux 2023 machine, installing to allow AMA to run.")
            rsyslog_exit_code, rsyslog_output = run_command_and_log("dnf install -y rsyslog")
            if rsyslog_exit_code != 0:
                return rsyslog_exit_code, rsyslog_output

    # Flag to handle the case where the same package is already installed
    same_package_installed = False

    # Check if the package is already installed with the correct version
    is_installed, installed_versions = get_installed_package_version()

    # Check if the package is already installed, if so determine if it is the same as the bundle or not
    if is_installed:
        hutil_log_info("Found installed azuremonitoragent version(s): {0}".format(installed_versions))
        # Check if already have this version of AMA installed, if so, no-op for install of AMA
        if len(installed_versions) == 1:
            current_bundle = get_current_bundle_file()
            hutil_log_info("Current bundle file: {0}".format(current_bundle))
            package_name = installed_versions[0]
            # This is to make sure dpkg's package name is in the same format as the BundleFileNameDeb
            if PackageManager == 'dpkg':
                architecture = ''
                if platform.machine() == 'x86_64':
                    architecture = '_x86_64'
                elif platform.machine() == 'aarch64':
                    architecture = '_aarch64'
                # need to change the ending from amd64 to x86_64 and arm64 to aarch64
                package_name = package_name + architecture

            if current_bundle == package_name:
                hutil_log_info("This version of azuremonitoragent package is already installed. Skipping package install.")
                same_package_installed = True
        else:
            # Only logged: the installation below still proceeds in this case
            hutil_log_error("Multiple versions of azuremonitoragent package found: {0}\n This is undefined behavior, we recommend running the following:".format(installed_versions))
            if PackageManager == 'dpkg':
                hutil_log_error("Run the following command first:\n dpkg --purge azuremonitoragent. If this does not work try the following with caution:\n"
                "'rm /var/lib/dpkg/info/azuremonitoragent.*' followed by 'dpkg --force-all -P azuremonitoragent'")
            elif PackageManager == 'rpm': # For reference AzureLinux 3.0 also falls under this category
                hutil_log_error("Run the following command first: ")
                hutil_log_error("'rpm -q azuremonitoragent' and for each version run: rpm -e azuremonitoragent-(version)-(bundle_number).(architecture), or rpm -e --deleteall azuremonitoragent\n An example of the command is as follows: rpm -e {0}".format(installed_versions[0]))
                hutil_log_error("If the following does not work please try the following: rpm -e --noscripts --nodeps azuremonitoragent-(version)-(bundle_number).(architecture). I.e. rpm -e --noscripts --nodeps {0}".format(installed_versions[0]))

    # If the same bundle of Azure Monitor Agent package is not already installed, proceed with installation
    if not same_package_installed:
        hutil_log_info("Installing Azure Monitor Agent package.")
        package_directory = os.path.join(os.getcwd(), PackagesDirectory)
        bundle_path = os.path.join(package_directory, BundleFileName)
        # NOTE(review): 100 is a decimal literal (equal to 0o144), not an octal
        # mode such as 0o100 - confirm which permissions are intended
        os.chmod(bundle_path, 100)
        print(PackageManager, " and ", BundleFileName)
        AMAInstallCommand = "{0} {1} -i {2}".format(PackageManager, PackageManagerOptions, bundle_path)
        hutil_log_info('Running command "{0}"'.format(AMAInstallCommand))

        # Try to install with retry, since install can fail due to concurrent package operations
        exit_code, output = run_command_with_retries_output(AMAInstallCommand, retries = 15,
                                                            retry_check = retry_if_dpkg_or_rpm_locked,
                                                            final_check = final_check_if_dpkg_or_rpm_locked)

        # Retry install for aarch64 rhel8 VMs as initial install fails to create symlink to /etc/systemd/system/azuremonitoragent.service
        # in /etc/systemd/system/multi-user.target.wants/azuremonitoragent.service
        if vm_dist.replace(' ','').lower().startswith('redhat') and vm_ver == '8.6' and platform.machine() == 'aarch64':
            exit_code, output = run_command_with_retries_output(AMAInstallCommand, retries = 15,
                                                                retry_check = retry_if_dpkg_or_rpm_locked,
                                                                final_check = final_check_if_dpkg_or_rpm_locked)

        if exit_code != 0:
            return exit_code, output

        # System daemon reload is required for systemd to pick up the new service
        exit_code, output = run_command_and_log("systemctl daemon-reload")
        if exit_code != 0:
            return exit_code, output

    # Copy the AMACoreAgent and agentlauncher binaries
    copy_amacoreagent_binaries()

    set_metrics_binaries()

    # Copy KqlExtension binaries
    # Needs to be revisited for aarch64
    copy_kqlextension_binaries()

    # Copy mdsd and fluent-bit with OpenSSL dynamically linked
    if is_feature_enabled('useDynamicSSL'):
        # Check if they have libssl.so.1.1 since AMA is built against this version
        libssl1_1, _ = run_command_and_log('ldconfig -p | grep libssl.so.1.1')
        if libssl1_1 == 0:
            copy_mdsd_fluentbit_binaries()

    # Set task limits to max of 65K in suse 12
    # Based on Task 9764411: AMA broken after 1.7 in sles 12 - https://dev.azure.com/msazure/One/_workitems/edit/9764411
    # The distro is queried again here; only its name is needed this time
    vm_dist, _ = find_vm_distro('Install')
    if (vm_dist.startswith('suse') or vm_dist.startswith('sles')):
        try:
            # Write a systemd drop-in (override.conf) with a [Service] section
            # that sets TasksMax, then reload systemd so it takes effect
            suse_exit_code, suse_output = run_command_and_log("mkdir -p /etc/systemd/system/azuremonitoragent.service.d")
            if suse_exit_code != 0:
                return suse_exit_code, suse_output

            suse_exit_code, suse_output = run_command_and_log("echo '[Service]' > /etc/systemd/system/azuremonitoragent.service.d/override.conf")
            if suse_exit_code != 0:
                return suse_exit_code, suse_output

            suse_exit_code, suse_output = run_command_and_log("echo 'TasksMax=65535' >> /etc/systemd/system/azuremonitoragent.service.d/override.conf")
            if suse_exit_code != 0:
                return suse_exit_code, suse_output

            suse_exit_code, suse_output = run_command_and_log("systemctl daemon-reload")
            if suse_exit_code != 0:
                return suse_exit_code, suse_output
        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit -
        # consider narrowing to `except Exception`
        except:
            log_and_exit("install", MissingorInvalidParameterErrorCode, "Failed to update /etc/systemd/system/azuremonitoragent.service.d for suse 12,15" )

    return 0, "Azure Monitor Agent package installed successfully"
def uninstall():
    """
    Uninstall the Azure Monitor Linux Agent.
    Whether it is a purge of all files or preserve of log files depends on the uninstall context file.
    Note: uninstall operation times out from WAAgent at 5 minutes

    Returns: (exit_code, output) - (0, message) when the agent was not
    installed to begin with; otherwise the result of
    force_uninstall_azure_monitor_agent(), or (GenericErrorCode, error text
    with stack trace) if that final step raised.
    """
    exit_if_vm_not_supported('Uninstall')
    find_package_manager("Uninstall")

    # Before we uninstall, we need to ensure AMA is installed to begin with
    is_installed, installed_versions = get_installed_package_version()
    if not is_installed:
        hutil_log_info("Azure Monitor Agent is not installed, nothing to uninstall.")
        return 0, "Azure Monitor Agent is not installed, nothing to uninstall."

    if PackageManager != "dpkg" and PackageManager != "rpm":
        log_and_exit("Uninstall", UnsupportedOperatingSystem, "The OS has neither rpm nor dpkg." )

    # First attempt: a regular removal. Its exit_code/output are not returned
    # directly; both are reassigned by the final step at the end of this function.
    # Try a specific package uninstall for rpm
    if PackageManager == "rpm":
        purge_cmd_template = "rpm -e {0}"
        # Process each package
        for package_name in installed_versions:
            # Skip blank entries in the version list
            if not package_name.strip():
                continue

            package_name = package_name.strip()
            # Clean the package name and create uninstall command
            uninstall_command = purge_cmd_template.format(package_name)
            hutil_log_info("Removing package: {0} by running {1}".format(package_name, uninstall_command))

            # Execute uninstall command with retries
            exit_code, output = run_command_with_retries_output(
                uninstall_command,
                retries=4,
                retry_check=retry_if_dpkg_or_rpm_locked,
                final_check=final_check_if_dpkg_or_rpm_locked
            )
    elif PackageManager == "dpkg":
        AMAUninstallCommand = "dpkg -P azuremonitoragent"
        hutil_log_info("Removing package: azuremonitoragent by running {0}".format(AMAUninstallCommand))
        exit_code, output = run_command_with_retries_output(
            AMAUninstallCommand,
            retries=4,
            retry_check=retry_if_dpkg_or_rpm_locked,
            final_check=final_check_if_dpkg_or_rpm_locked
        )

    remove_localsyslog_configs()
    uninstall_azureotelcollector()

    # remove the logrotate config
    if os.path.exists(AMAExtensionLogRotateFilePath):
        try:
            os.remove(AMAExtensionLogRotateFilePath)
        except Exception as ex:
            # Best effort: the failure is logged and the uninstall continues
            output = 'Logrotate removal failed with error: {0}\n' \
                     'Stacktrace: {1}'.format(ex, traceback.format_exc())
            hutil_log_info(output)

    # Retry, since uninstall can fail due to concurrent package operations
    try:
        # Verifies the removal and escalates to forced removal if packages remain
        exit_code, output = force_uninstall_azure_monitor_agent()

        # Remove all files installed by the package that were listed by the azuremonitor spec
        _remove_package_files_from_list()
    except Exception as ex:
        exit_code = GenericErrorCode
        output = 'Uninstall failed with error: {0}\n' \
                 'Stacktrace: {1}'.format(ex, traceback.format_exc())

    return exit_code, output
def force_uninstall_azure_monitor_agent():
    """
    Force uninstall the Azure Monitor Linux Agent package with possibility of multiple existing Azure Monitor Agent Linux packages.

    If no package is installed any more, nothing is done. Otherwise:
      - dpkg: the package's /var/lib/dpkg/info files are removed, then the
        package is purged with --force-all.
      - rpm: all matching packages are erased with --allmatches; any package
        that still remains is erased individually with --noscripts --nodeps.

    Returns: (exit_code, output_message) - 0 when no package remains
    afterwards, 1 (with the remaining packages named in the message) otherwise.
    """
    # Check if azuremonitoragent is still installed
    is_installed, remaining_packages = get_installed_package_version()
    if not is_installed:
        # Since there was no indication of AMA, we can assume it was uninstalled successfully
        hutil_log_info("Azure Monitor Agent has been uninstalled.")
        return 0, "Azure Monitor Agent has been uninstalled."

    # Since the previous uninstall failed we are going down the route of uninstall without dep and pre/post
    hutil_log_info("Initial uninstall command did not remove all packages. Remaining packages: {0}".format(remaining_packages))
    commands_used = []

    if PackageManager == "dpkg":
        # we can remove the post and pre scripts first then purge
        RemoveScriptsCommand = "rm /var/lib/dpkg/info/azuremonitoragent.*"
        run_command_with_retries_output(RemoveScriptsCommand, retries = 4,
                                        retry_check = retry_if_dpkg_or_rpm_locked,
                                        final_check = final_check_if_dpkg_or_rpm_locked)

        AMAUninstallCommandForce = "dpkg --force-all -P azuremonitoragent"
        hutil_log_info('Running command "{0}"'.format(AMAUninstallCommandForce))
        # The command's own result is not used: success is decided by the
        # package query at the end of this function
        run_command_with_retries_output(AMAUninstallCommandForce, retries = 4,
                                        retry_check = retry_if_dpkg_or_rpm_locked,
                                        final_check = final_check_if_dpkg_or_rpm_locked)
        commands_used.extend([RemoveScriptsCommand, AMAUninstallCommandForce])
    elif PackageManager == "rpm":
        # First try to mass uninstall AMA by using the --allmatches flag for rpm
        # This is a more robust version of uninstall() since it uses the --allmatches flag
        AMAUninstallCommand = "rpm -e --allmatches azuremonitoragent"
        hutil_log_info('Running command "{0}"'.format(AMAUninstallCommand))
        exit_code, output = run_command_with_retries_output(AMAUninstallCommand, retries = 4,
                                                            retry_check = retry_if_dpkg_or_rpm_locked,
                                                            final_check = final_check_if_dpkg_or_rpm_locked)
        # Log the command that actually ran (previously this formatted the
        # still-empty per-package force command)
        hutil_log_info("Force uninstall command {0} returned exit code {1} and output: {2}".format(AMAUninstallCommand, exit_code, output))
        commands_used.append(AMAUninstallCommand)

        # Query to see what is left after using the --allmatches uninstall
        is_still_installed, remaining_packages = get_installed_package_version()

        # If the above command fails, we will try to force uninstall each package by using the --noscripts and --nodeps flags
        if is_still_installed:
            hutil_log_info("Failed to uninstall azuremonitoragent with --allmatches, trying to force uninstall each package individually.")
            # --noscripts and --nodeps flags are used to avoid running any pre/post scripts and skip dependencies test
            # https://jfearn.fedorapeople.org/en-US/RPM/4/html/RPM_Guide/ch03s03s03.html
            for package in remaining_packages:
                # Clean the package name and create uninstall command
                package = package.strip()
                if not package:
                    continue

                AMAUninstallCommandForce = "rpm -e --noscripts --nodeps {0}".format(package)
                commands_used.append(AMAUninstallCommandForce)
                hutil_log_info('Running command "{0}"'.format(AMAUninstallCommandForce))
                exit_code, output = run_command_with_retries_output(AMAUninstallCommandForce, retries = 4,
                                                                    retry_check = retry_if_dpkg_or_rpm_locked,
                                                                    final_check = final_check_if_dpkg_or_rpm_locked)
                hutil_log_info("Force uninstall command {0} returned exit code {1} and output: {2}".format(AMAUninstallCommandForce, exit_code, output))

    # Check if packages are still installed
    is_still_installed, remaining_packages = get_installed_package_version()
    if is_still_installed:
        output = "Force uninstall did not remove all packages, remaining packages: {0}".format(remaining_packages)
        hutil_log_info("Force uninstall did not remove all packages, remaining packages: {0}".format(remaining_packages))
        return 1, output

    hutil_log_info("Force uninstall removed all packages successfully after using: {0}".format(", ".join(commands_used)))
    return 0, "Azure Monitor Agent packages uninstalled successfully after using: {0}".format(", ".join(commands_used))
def _get_package_files_for_cleanup():
    """
    List the azuremonitor-related paths owned by the azuremonitoragent package.

    The list is obtained from the package manager ("dpkg -L" or "rpm -ql",
    selected by PackageManager), so this must be called BEFORE the package is
    uninstalled, while the package manager still has the file list available.

    Returns:
        list: every path reported by the package manager whose text contains
              "azuremonitor", in the order reported. An empty list is returned
              when the package manager is unknown, when the listing command
              fails or prints nothing, or when an exception is raised while
              gathering the list.
    """
    try:
        # Listing command per supported package manager
        listing_commands = {
            "dpkg": "dpkg -L azuremonitoragent",  # Debian-based systems
            "rpm": "rpm -ql azuremonitoragent",   # RPM-based systems
        }
        listing_cmd = listing_commands.get(PackageManager)
        if listing_cmd is None:
            hutil_log_info("Unknown package manager, cannot list package files")
            return []

        exit_code, output = run_command_and_log(listing_cmd, check_error=False)
        if exit_code != 0 or not output:
            hutil_log_info("Could not get package file list for cleanup")
            return []

        # Keep only paths mentioning "azuremonitor"; this matches both the
        # "azuremonitoragent" tree and the "azuremonitor-*" service files.
        matching_paths = []
        for raw_line in output.strip().split('\n'):
            candidate = raw_line.strip()
            if not candidate:
                continue
            if "azuremonitor" not in candidate:
                hutil_log_info("Skipping non-azuremonitor path: {0}".format(candidate))
                continue
            matching_paths.append(candidate)
        return matching_paths
    except Exception as ex:
        hutil_log_error("Error gathering package files for cleanup: {0}\n Is Azure Monitor Agent Installed?".format(ex))
        return []
def _remove_package_files_from_list():
    """
    Remove the files and directories that were installed by the azuremonitoragent
    package, using the path list returned by _get_package_files_for_cleanup().

    The /opt/microsoft/azuremonitoragent/ directory is always added to the list,
    because on rpm systems the package file list alone does not get that
    directory and its contents removed.

    Removal is best effort: a failure on one path is logged and the remaining
    paths are still processed. Nothing is raised to the caller and nothing is
    returned. If the package file list is empty, no path is touched at all.
    """
    try:
        hutil_log_info("Gathering package file list for Azure Monitor Agent installed via {0}".format(PackageManager))
        package_files_for_cleanup = _get_package_files_for_cleanup()
        if not package_files_for_cleanup:
            hutil_log_info("No package files provided for removal. Exiting without removing any files.")
            return

        # Build consolidated, de-duplicated set of paths to clean up
        cleanup_paths = set(package_files_for_cleanup)

        # Add directories that need explicit cleanup since on rpm systems
        # the initial list for this path does not remove the directories and files
        cleanup_paths.add("/opt/microsoft/azuremonitoragent/")

        # Sort paths by depth (deepest first) to avoid removing parent before children
        sorted_paths = sorted(cleanup_paths, key=lambda x: x.count('/'), reverse=True)
        hutil_log_info("Removing {0} azuremonitor paths".format(len(sorted_paths)))

        items_removed = 0
        for item_path in sorted_paths:
            try:
                # lexists() does not follow symlinks, so dangling links left by
                # the package are still detected and cleaned up.
                if not os.path.lexists(item_path):
                    continue

                if os.path.islink(item_path):
                    # Unlink the link itself: rmtree() refuses symlinks and the
                    # link target is not necessarily owned by this package.
                    os.remove(item_path)
                    hutil_log_info("Removed symlink: {0}".format(item_path))
                elif os.path.isdir(item_path):
                    rmtree(item_path)
                    hutil_log_info("Removed directory: {0}".format(item_path))
                else:
                    os.remove(item_path)
                    hutil_log_info("Removed file: {0}".format(item_path))
                items_removed += 1
            except Exception as ex:
                # Best effort: keep going with the remaining paths
                hutil_log_info("Failed to remove {0}: {1}".format(item_path, ex))

        hutil_log_info("Removed {0} items total".format(items_removed))
    except Exception as ex:
        hutil_log_error("Error during file removal from list: {0}\n Were these files removed already?".format(ex))
def enable():
    """
    Start the Azure Monitor Linux Agent Service
    This call will return non-zero or throw an exception if
    the settings provided are incomplete or incorrect.
    Note: enable operation times out from WAAgent at 5 minutes

    Steps performed:
      1. Read the public/protected settings and decide which services must run
         (tracked in the `ensure` ordered dictionary: True = must run, False = must be stopped).
      2. Let the mode-specific handlers (handle_mcs_config / handle_gcs_config) adjust the
         environment variables in `default_configs`, then write them to /etc/default/azuremonitoragent.
      3. Stop and disable services that are not required (failures here are ignored),
         then start (or restart) and enable the required ones.
      4. Save the sequence number once the required services were started successfully.

    Returns:
        tuple: (exit_code, output). exit_code is the exit code of the first required
               service command that failed, or 0 when all required services were handled
               successfully. output is the concatenated output of the service commands.
    """
    public_settings, protected_settings = get_settings()

    exit_if_vm_not_supported('Enable')

    # Service name -> whether the service must be running once enable completes
    ensure = OrderedDict([
        ("azuremonitoragent", False),
        ("azuremonitoragentmgr", False)
    ])

    # Set traceFlags in publicSettings to enable mdsd tracing. For example, the EventIngest flag can be enabled via "traceFlags": "0x2"
    flags = ""
    if public_settings is not None and "traceFlags" in public_settings:
        flags = "-T {} ".format(public_settings.get("traceFlags"))

    # Use an Ordered Dictionary to ensure MDSD_OPTIONS (and other dependent variables) are written after their dependencies
    default_configs = OrderedDict([
        ("MDSD_CONFIG_DIR", "/etc/opt/microsoft/azuremonitoragent"),
        ("MDSD_LOG_DIR", "/var/opt/microsoft/azuremonitoragent/log"),
        ("MDSD_ROLE_PREFIX", "/run/azuremonitoragent/default"),
        ("MDSD_SPOOL_DIRECTORY", "/var/opt/microsoft/azuremonitoragent"),
        ("MDSD_OPTIONS", "\"{}-A -R -c /etc/opt/microsoft/azuremonitoragent/mdsd.xml -d -r $MDSD_ROLE_PREFIX -S $MDSD_SPOOL_DIRECTORY/eh -L $MDSD_SPOOL_DIRECTORY/events\"".format(flags)),
        ("MDSD_USE_LOCAL_PERSISTENCY", "true"),
        ("MDSD_TCMALLOC_RELEASE_FREQ_SEC", "1"),
        ("MONITORING_USE_GENEVA_CONFIG_SERVICE", "false"),
        ("ENABLE_MCS", "false")
    ])

    ssl_cert_var_name, ssl_cert_var_value = get_ssl_cert_info('Enable')
    default_configs[ssl_cert_var_name] = ssl_cert_var_value

    # Decide the mode and configuration. There are two supported configuration schema, mix-and-match between schemas is disallowed:
    #     Legacy: allows one of [MCS, GCS single tenant, or GCS multi tenant ("Auto-Config")] modes
    #     Next-Generation: allows MCS, GCS multi tenant, or both
    is_gcs_single_tenant = False
    GcsEnabled, McsEnabled = get_control_plane_mode()

    # Next-generation schema
    if public_settings is not None and (public_settings.get(GenevaConfigKey) or public_settings.get(AzureMonitorConfigKey)):
        geneva_configuration = public_settings.get(GenevaConfigKey)
        azure_monitor_configuration = public_settings.get(AzureMonitorConfigKey)

        # Check for mix-and match of next-generation and legacy schema content:
        # more than one public setting while only one of the two next-generation keys is populated
        if len(public_settings) > 1 and ((geneva_configuration and not azure_monitor_configuration) or (azure_monitor_configuration and not geneva_configuration)):
            log_and_exit("Enable", MissingorInvalidParameterErrorCode, 'Mixing genevaConfiguration or azureMonitorConfiguration with other configuration schemas is not allowed')

        if geneva_configuration and geneva_configuration.get("enable") == True:
            hutil_log_info("Detected Geneva+ mode; azuremonitoragentmgr service will be started to handle Geneva tenants")
            ensure["azuremonitoragentmgr"] = True

        if azure_monitor_configuration and azure_monitor_configuration.get("enable") == True:
            hutil_log_info("Detected Azure Monitor+ mode; azuremonitoragent service will be started to handle Azure Monitor tenant")
            ensure["azuremonitoragent"] = True
            azure_monitor_public_settings = azure_monitor_configuration.get("configuration")
            azure_monitor_protected_settings = protected_settings.get(AzureMonitorConfigKey) if protected_settings is not None else None
            handle_mcs_config(azure_monitor_public_settings, azure_monitor_protected_settings, default_configs)

    # Legacy schema
    elif public_settings is not None and public_settings.get("GCS_AUTO_CONFIG") == True:
        hutil_log_info("Detected Auto-Config mode; azuremonitoragentmgr service will be started to handle Geneva tenants")
        ensure["azuremonitoragentmgr"] = True

    elif (protected_settings is None or len(protected_settings) == 0) or (public_settings is not None and "proxy" in public_settings and "mode" in public_settings.get("proxy") and public_settings.get("proxy").get("mode") == "application"):
        hutil_log_info("Detected Azure Monitor mode; azuremonitoragent service will be started to handle Azure Monitor configuration")
        ensure["azuremonitoragent"] = True
        handle_mcs_config(public_settings, protected_settings, default_configs)

    else:
        hutil_log_info("Detected Geneva mode; azuremonitoragent service will be started to handle Geneva configuration")
        ensure["azuremonitoragent"] = True
        is_gcs_single_tenant = True
        handle_gcs_config(public_settings, protected_settings, default_configs)

    # generate local syslog configuration files as in auto config syslog is not driven from DCR
    # Note that internally AMCS with geneva config path can be used in which case syslog should be handled same way as default 1P
    # generate local syslog configuration files as in 1P syslog is not driven from DCR
    if GcsEnabled:
        generate_localsyslog_configs(uses_gcs=True, uses_mcs=McsEnabled)

    config_file = "/etc/default/azuremonitoragent"
    temp_config_file = "/etc/default/azuremonitoragent_temp"

    try:
        if os.path.isfile(config_file):
            new_config = "\n".join(["export {0}={1}".format(key, value) for key, value in default_configs.items()]) + "\n"

            with open(temp_config_file, "w") as f:
                f.write(new_config)

            if not os.path.isfile(temp_config_file):
                log_and_exit("Enable", GenericErrorCode, "Error while updating environment variables in {0}".format(config_file))

            # os.rename replaces an existing destination file in a single step on POSIX, so the
            # config file never goes missing if this process is interrupted between two calls
            os.rename(temp_config_file, config_file)
        else:
            log_and_exit("Enable", GenericErrorCode, "Could not find the file {0}".format(config_file))
    except Exception as e:
        log_and_exit("Enable", GenericErrorCode, "Failed to add environment variables to {0}: {1}".format(config_file, e))

    if "ENABLE_MCS" in default_configs and default_configs["ENABLE_MCS"] == "true":
        # enable processes for Custom Logs
        ensure["azuremonitor-agentlauncher"] = True
        ensure["azuremonitor-coreagent"] = True

        # start the metrics, agent transform and syslog watchers only in 3P mode
        start_metrics_process()
        start_syslogconfig_process()
    elif ensure.get("azuremonitoragentmgr") or is_gcs_single_tenant:
        # In GCS scenarios, ensure that AMACoreAgent is running
        ensure["azuremonitor-coreagent"] = True

    hutil_log_info('Handler initiating onboarding.')

    if HUtilObject and HUtilObject.is_seq_smaller():
        # Either upgrade has just happened (in which case we need to start), or enable was called with no change to extension config
        hutil_log_info("Current sequence number, {0}, is not greater than the LKG sequence number. Starting service(s) only if it is not yet running.".format(HUtilObject._context._seq_no))
        operations = ["start", "enable"]
    else:
        # Either this is a clean install (in which case restart is effectively start), or extension config has changed
        if HUtilObject:
            hutil_log_info("Current sequence number, {0}, is greater than the LKG sequence number. Restarting service(s) to pick up the new config.".format(HUtilObject._context._seq_no))
        else:
            # The condition above tolerates a missing HUtilObject, so this branch must not dereference it
            hutil_log_info("Handler utility object is not available. Restarting service(s) to pick up the new config.")
        operations = ["restart", "enable"]

    output = ""
    # Only a failure to start a required service may produce a non-zero exit code
    exit_code = 0

    # Ensure non-required services are not running; do not block if this step fails
    # (the exit code is deliberately discarded so it cannot leak into the return value)
    for service in [s for s in ensure.keys() if not ensure[s]]:
        _, disable_output = run_command_and_log(get_service_command(service, "stop", "disable"))
        output += disable_output

    for service in [s for s in ensure.keys() if ensure[s]]:
        exit_code, enable_output = run_command_and_log(get_service_command(service, *operations))
        output += enable_output

        if exit_code != 0:
            # Attach the service status to the output to help diagnose the failure, then stop
            status_command = get_service_command(service, "status")
            status_exit_code, status_output = run_command_and_log(status_command)
            if status_exit_code != 0:
                output += "Output of '{0}':\n{1}".format(status_command, status_output)
            return exit_code, output

    if platform.machine() != 'aarch64':
        if "ENABLE_MCS" in default_configs and default_configs["ENABLE_MCS"] == "true":
            # start/enable kql extension only in 3P mode and non aarch64
            kql_start_code, kql_output = run_command_and_log(get_service_command("azuremonitor-kqlextension", *operations))
            output += kql_output  # do not block if kql start fails
            # start transformation config watcher process
            start_transformconfig_process()

    # Service(s) were successfully configured and started; increment sequence number
    if HUtilObject:
        HUtilObject.save_seq()

    return exit_code, output
def handle_gcs_config(public_settings, protected_settings, default_configs):
"""
Populate the defaults for legacy-path GCS mode
"""
# look for LA protected settings
for var in list(protected_settings.keys()):
if "_key" in var or "_id" in var:
default_configs[var] = protected_settings.get(var)
# check if required GCS params are available
MONITORING_GCS_CERT_CERTFILE = None
if "certificate" in protected_settings:
MONITORING_GCS_CERT_CERTFILE = base64.standard_b64decode(protected_settings.get("certificate"))
if "certificatePath" in protected_settings:
try:
with open(protected_settings.get("certificatePath"), 'r') as f:
MONITORING_GCS_CERT_CERTFILE = f.read()
except Exception as ex:
log_and_exit('Enable', MissingorInvalidParameterErrorCode, 'Failed to read certificate {0}: {1}'.format(protected_settings.get("certificatePath"), ex))
MONITORING_GCS_CERT_KEYFILE = None
if "certificateKey" in protected_settings:
MONITORING_GCS_CERT_KEYFILE = base64.standard_b64decode(protected_settings.get("certificateKey"))
if "certificateKeyPath" in protected_settings:
try:
with open(protected_settings.get("certificateKeyPath"), 'r') as f:
MONITORING_GCS_CERT_KEYFILE = f.read()
except Exception as ex:
log_and_exit('Enable', MissingorInvalidParameterErrorCode, 'Failed to read certificate key {0}: {1}'.format(protected_settings.get("certificateKeyPath"), ex))
MONITORING_GCS_ENVIRONMENT = ""
if "monitoringGCSEnvironment" in protected_settings:
MONITORING_GCS_ENVIRONMENT = protected_settings.get("monitoringGCSEnvironment")
MONITORING_GCS_NAMESPACE = ""
if "namespace" in protected_settings:
MONITORING_GCS_NAMESPACE = protected_settings.get("namespace")
MONITORING_GCS_ACCOUNT = ""
if "monitoringGCSAccount" in protected_settings:
MONITORING_GCS_ACCOUNT = protected_settings.get("monitoringGCSAccount")
MONITORING_GCS_REGION = ""
if "monitoringGCSRegion" in protected_settings:
MONITORING_GCS_REGION = protected_settings.get("monitoringGCSRegion")
MONITORING_CONFIG_VERSION = ""
if "configVersion" in protected_settings:
MONITORING_CONFIG_VERSION = protected_settings.get("configVersion")
MONITORING_GCS_AUTH_ID_TYPE = ""
if "monitoringGCSAuthIdType" in protected_settings:
MONITORING_GCS_AUTH_ID_TYPE = protected_settings.get("monitoringGCSAuthIdType")