Coverage for aisdb/webdata/_scraper.py: 91%

22 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-30 04:22 +0000

1''' webscraper using selenium, firefox, and mozilla geckodriver ''' 

2 

3import os 

4import shutil 

5 

6 

def _scraper():
    ''' create a selenium Firefox ``webdriver``, headless by default

        The geckodriver executable path is obtained from
        ``GeckoDriverManager().install()`` and Firefox is started with the
        preferences listed in ``preferences`` below.

        environment variables:
            DEBUG: when set to a non-empty value, the browser is not started
                headless and its window is maximized
            HEADLESS: when set to ``0``, the browser is not started headless
                (the window is still resized to 9999x9999 unless DEBUG is
                also set)

        returns:
            the open ``selenium.webdriver.Firefox`` instance; it is not
            closed here, so the caller should ``quit()`` it when finished

        raises:
            whatever selenium / webdriver_manager raise while installing the
            driver, launching the browser, or resizing the window. If the
            failure happens after the browser was launched, the browser is
            shut down before the exception propagates.
    '''
    # imported lazily so that selenium is only needed when scraping
    from selenium import webdriver
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service
    from webdriver_manager.firefox import GeckoDriverManager

    debug = bool(os.environ.get('DEBUG'))
    show_window = debug or os.environ.get('HEADLESS') == '0'

    # firefox preferences applied to every session, in this order
    preferences = {
        'permissions.default.image': 2,
        'extensions.contentblocker.enabled': True,
        'media.autoplay.default': 2,
        'media.autoplay.allow-muted': False,
        'media.autoplay.block-event.enabled': True,
        'media.autoplay.block-webaudio': True,
        'services.sync.prefs.sync.media.autoplay.default': False,
        'ui.context_menus.after_mouseup': False,
        'privacy.sanitize.sanitizeOnShutdown': True,
        'dom.disable_beforeunload': True,
    }

    opt = Options()
    for pref_name, pref_value in preferences.items():
        opt.set_preference(pref_name, pref_value)
    if not show_window:
        opt.add_argument('-headless')

    driver = webdriver.Firefox(
        service=Service(executable_path=GeckoDriverManager().install()),
        options=opt)

    # The browser process is running from here on. If sizing the window
    # fails the caller never receives the handle, so shut the browser down
    # instead of leaving it orphaned.
    try:
        if debug:
            driver.maximize_window()
        else:
            driver.set_window_size(9999, 9999)
    except BaseException:
        driver.quit()
        raise

    return driver