diff --git a/.dockerignore b/.dockerignore
index 5518e60..79d8c95 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,7 +4,11 @@
 .gitignore
 .gitlab-ci.yml
 .idea
+.ipython
 .pre-commit-config.yaml
 .readthedocs.yml
 .travis.yml
+.continue
+.envs/.local/.django
+temp
 venv
diff --git a/.envs/.local/.django b/.envs/.local/.django
index 168e012..53f6444 100644
--- a/.envs/.local/.django
+++ b/.envs/.local/.django
@@ -15,3 +15,13 @@ CELERY_BROKER_URL=redis://redis:6379/0
 # ------------------------------------------------------------------------------
 CELERY_FLOWER_USER=PhFRdLexbrsBvrrbSXxjcMMOcVOavCrZ
 CELERY_FLOWER_PASSWORD=QgScyefPrYhHgO6onW61u0nazc5xdBuP4sM7jMRrBBFuA2RjsFhZLp7xbVYZbrwR
+
+# OpenSearch
+# ------------------------------------------------------------------------------
+OPENSEARCH_URL=http://172.20.0.1:9200
+OPENSEARCH_BASIC_AUTH=admin,UmaSenhaForte123!
+OPENSEARCH_VERIFY_CERTS=False
+
+# SciELO Books API
+# ------------------------------------------------------------------------------
+SCIELO_BOOKS_BASE_URL=http://192.168.2.154:31735
diff --git a/.gitignore b/.gitignore
index 6342047..dd2c92d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,4 +169,11 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+# Local agent/editor state
+.continue/
+temp/
+.envs/.local/.django
+start-dev.sh
+opencode.json
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 55509fe..0000000
--- a/.pylintrc
+++ /dev/null
@@ -1,14 +0,0 @@
-[MASTER]
-load-plugins=pylint_django, pylint_celery
-django-settings-module=config.settings.base
-[FORMAT]
-max-line-length=120
-
-[MESSAGES CONTROL]
-disable=missing-docstring,invalid-name
-
-[DESIGN]
-max-parents=13
-
-[TYPECHECK]
-generated-members=REQUEST,acl_users,aq_parent,"[a-zA-Z]+_set{1,2}",save,delete
diff --git a/.readthedocs.yml b/.readthedocs.yml
deleted file mode 100644
index b4cf0c0..0000000
--- a/.readthedocs.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 2
-
-sphinx:
-  configuration: docs/conf.py
-
-build:
-  image: testing
-
-python:
-  version: 3.9
-  install:
-    - requirements: requirements/local.txt
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..1f28439
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,83 @@
+# AGENTS.md
+
+## Project
+
+Django 5.2 + Wagtail 7.3 + Celery app that ingests SciELO access logs, validates them, and exports COUNTER-5 metrics to OpenSearch with monthly indices and daily nested metrics.
+
+## Key commands
+
+All commands run inside Docker via the `local.yml` compose file unless noted.
+
+```bash
+make build                  # build images
+make up                     # start all services (django, postgres, redis, celery worker+beat, mailhog)
+make django_shell           # Django shell via docker compose
+make django_test            # run full test suite (pytest)
+make django_fast            # tests with --failfast
+make django_migrate         # apply migrations
+make django_makemigrations  # generate new migrations
+make django_createsuperuser # create Wagtail admin user
+```
+
+**Run a single test file/path:**
+```bash
+docker compose -f local.yml run --rm django pytest path/to/test_file.py
+```
+
+**Without Docker** (rare): use `start-dev.sh` after adjusting the ethernet interface name.
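+
+For a manual run outside the Wagtail admin, a hedged sketch of kicking off the daily pipeline by name from `make django_shell` (the registered Celery task name is assumed to match the admin label; translation may change it):
+
+```python
+# Send the scheduled pipeline task by its registered name. Assumption: the
+# registered (possibly translated) name matches the label shown in the admin.
+from config import celery_app
+
+result = celery_app.send_task("[Log Pipeline] Daily Routine (Auto)")
+print(result.id)  # follow progress in the celery worker logs
+```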
+ +## Architecture + +- **Wagtail admin**: `http://localhost:8009/admin` (NOT Django admin at `/django-admin/`) +- **Django apps** (top-level dirs): `core` (Wagtail pages, users, utilities, collectors), `collection`, `log_manager`, `log_manager_config`, `metrics`, `document`, `reports`, `resources`, `source`, `tracker`, `core_settings` +- **`core/`** contains utilities, shared models, Wagtail hooks, templates, and the `collectors/` subpackage. `config/` is the Django project package (settings, urls, celery_app, wsgi). +- **Celery pipeline**: `task_daily_log_ingestion_pipeline` (auto-scheduled) chains Search -> Validate -> Parse -> Export using Celery chords. Individual steps can be triggered manually via Wagtail admin. +- **Task names** use translatable strings, e.g. `_[Log Pipeline] 1. Search Logs (Manual)` — do not rename these casually, it breaks the schedule. + +## Settings + +- `DJANGO_SETTINGS_MODULE` defaults to `config.settings.local` +- Tests use `config.settings.test` (set via `pytest.ini` `--ds=config.settings.test`) +- Env files live in `.envs/.local/` (local) and `.envs/.production/` (production) +- **`config/settings/test.py`** is minimal — it extends `base.py` and does NOT load local.py. If a test needs a setting that only exists in local.py, it must be added to test.py or set in the test directly. + +## Testing + +- Framework: **pytest** (not Django's `TestCase` runner), with `--reuse-db` by default +- Config: `pytest.ini` sets `--ds=config.settings.test --reuse-db` +- Both `unittest.TestCase` (Django-style) and pytest-style tests coexist; `pytest` is the runner +- CI runs: `build -> makemigrations -> migrate -> pytest` +- Shared fixtures in `core/conftest.py` (autouse `media_storage`, `user` fixture via factory-boy) + +## Linting & formatting + +- **black** (line length 120 implied by flake8 config; black defaults to 88 — pre-commit config pins it) +- **isort** (black profile via `line_length=88`) +- **flake8** (max-line-length=120 via setup.cfg) +- Pre-commit runs all three on commit. Configuration in `setup.cfg` (flake8, isort, mypy) and `.pre-commit-config.yaml`. + +## Local dev quirks + +- Two SciELO libs (`scielo_log_validator`, `scielo_usage_counter`) are installed from local repos mounted at `/app/scielo_log_validator` and `/app/scielo_usage_counter` when `USE_LOCAL_SCIELO_LIBS=1`. The local Dockerfile strips these from `base.txt` during build and installs them from the mounted volumes via the entrypoint script. +- Log files volume: `/mnt/pidata2/pi/scl/logs:/app/logs` (host-specific, may not exist on all machines) +- Mailhog UI at `http://localhost:8029` +- `manage.py` appends `core/` to `sys.path` so `from core.utils import ...` and `from utils import ...` both resolve. + +## OpenSearch + +- Client configured via `OPENSEARCH_URL`, `OPENSEARCH_BASIC_AUTH`, `OPENSEARCH_VERIFY_CERTS` +- Index naming: `usage_monthly_{collection}_{year}` (e.g. `usage_monthly_books_2026`) +- Upserts use Painless scripts for idempotent daily metric merging +- `OPENSEARCH_INDEX_NAME` (default `usage`) and `OPENSEARCH_API_KEY` are defined in base settings but not widely used + +## MCP tools + +- When you need to search framework/library docs (Django, Wagtail, Celery, OpenSearch, etc.), use `context7` tools. +- When you need to find code examples or patterns from open-source projects, use `gh_grep` tools. 
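+
+## Example: OpenSearch monthly upsert (reference sketch)
+
+A minimal sketch of the idempotent upsert pattern described in the OpenSearch section above, using `opensearch-py`. The index name follows the documented pattern; the document id and the `daily_metrics` shape are illustrative assumptions, not the production mapping.
+
+```python
+# Hedged sketch of the export step: one document per PID per month, with the
+# daily numbers merged in by a Painless script so re-processing the same log
+# is idempotent. Field names and document shape are assumptions.
+from opensearchpy import OpenSearch
+
+client = OpenSearch(
+    hosts=["http://172.20.0.1:9200"],          # OPENSEARCH_URL
+    http_auth=("admin", "UmaSenhaForte123!"),  # OPENSEARCH_BASIC_AUTH split on ","
+    verify_certs=False,                        # OPENSEARCH_VERIFY_CERTS
+)
+
+pid = "S0100-19652026000100001"     # hypothetical PID
+index = "usage_monthly_scl_2026"    # usage_monthly_{collection}_{year}
+doc_id = f"{pid}-2026-01"           # hypothetical per-PID monthly id
+day = "2026-01-15"
+metrics = {"total_item_requests": 12, "unique_item_requests": 9}
+
+client.update(
+    index=index,
+    id=doc_id,
+    body={
+        "script": {
+            "lang": "painless",
+            # Merge or overwrite the metrics for a single day inside the
+            # monthly document, without touching the other days.
+            "source": (
+                "if (ctx._source.daily_metrics == null) { ctx._source.daily_metrics = [:]; } "
+                "ctx._source.daily_metrics[params.date] = params.metrics;"
+            ),
+            "params": {"date": day, "metrics": metrics},
+        },
+        "upsert": {"pid": pid, "collection": "scl", "daily_metrics": {day: metrics}},
+    },
+)
+```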
+ +## Wagtail-specific notes + +- Multi-language: `pt-br` (default), `en`, `es` +- Wagtail URL prefixes disabled (`prefix_default_language=False`) +- After adding a language, run `make wagtail_sync` and `make wagtail_update_translation_field` +- `wagtail-modeladmin` is used for managing pipeline entities in admin diff --git a/COPYING b/COPYING deleted file mode 100644 index 94a9ed0..0000000 --- a/COPYING +++ /dev/null @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. 
- - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. 
A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. 
- - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. 
If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). 
To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. 
- - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. 
- - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - Copyright (C) - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -. diff --git a/README.md b/README.md index 5347d9d..4dd7a22 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,125 @@ -## SciELO Usage +# SciELO Usage Metrics Pipeline -## Dev Installation +A modernized platform for processing and indexing SciELO usage logs into OpenSearch, adhering to COUNTER R5.1 standards. -To build and run the application, being at the root of the project, you can follow these steps: +**Version**: 2.0.0 + +## Quick Start (Dev Installation) + +To build and run the application locally: 1. `make build compose=local.yml` -2. `make django_makemigrations` -3. `make django_migrate` -4. `make django_createsuperuser` -5. `make up` - -After the fifth step, the application should be functional and accessible at http://0.0.0.0:8009/admin - -### Additional notes: - -* The instructions assume that you have a working installation of Docker and `make`. -* The `make` commands use the `compose` file `local.yml` to start the application containers. -* The `django_makemigrations` and `django_migrate` commands are used to create and apply database migrations. 
-* The `django_createsuperuser` command is used to create a superuser account for the application.
-* The `make up` command starts the application containers in the background.
-* The application is accessible at http://0.0.0.0:8009/admin.
-* To log in to the admin panel, you will need to use the superuser credentials that you created with the `django_createsuperuser` command.
-* The `Log Manager` tool can be used to view log files and manage application configurations.
-* To test the application, you will need to add some content, such as a list of collections and configurations.
+2. `make django_migrate`
+3. `make django_createsuperuser`
+4. `make up`
+
+The application will be accessible at [http://localhost:8009/admin](http://localhost:8009/admin).
+
+---
+
+## Key Commands
+
+All commands run inside Docker via the `local.yml` compose file unless noted.
+
+```bash
+make build                  # build images
+make up                     # start all services (django, postgres, redis, celery worker+beat, mailhog)
+make django_shell           # Django shell via docker compose
+make django_test            # run full test suite (pytest)
+make django_fast            # tests with --failfast
+make django_migrate         # apply migrations
+make django_makemigrations  # generate new migrations
+make django_createsuperuser # create Wagtail admin user
+```
+
+**Run a single test file/path:**
+```bash
+docker compose -f local.yml run --rm django pytest path/to/test_file.py
+```
+
+## Architecture & Data Pipeline
+
+### Apps
+
+| App | Purpose |
+|---|---|
+| `log_manager` | Log file discovery, validation, and status tracking |
+| `log_manager_config` | Collection-specific configuration (paths, emails, expected logs/day) |
+| `metrics` | Daily metric jobs, OpenSearch export, COUNTER R5.1 aggregation |
+| `document` | Unified metadata model for articles, books, chapters, datasets, and preprints |
+| `source` | Journal, book, preprint server, and data repository metadata |
+| `reports` | Weekly, monthly, and yearly log processing reports |
+| `resources` | Robot user-agent patterns and GeoIP MMDB management |
+| `tracker` | Discarded line tracking and error logging |
+| `core` | Wagtail pages, users, shared utilities, and external API collectors |
+| `collection` | SciELO collection management |
+
+### Core Collectors (`core/collectors/`)
+
+| Collector | Source |
+|---|---|
+| `articlemeta.py` | ArticleMeta REST/Thrift API |
+| `opac.py` | SciELO OPAC endpoint |
+| `preprints.py` | SciELO Preprints OAI-PMH |
+| `dataverse.py` | SciELO Data (Dataverse) |
+| `scielo_books.py` | SciELO Books CouchDB changes feed |
+
+### Log Ingestion Pipeline
+
+The ingestion is fully automated via the **`[Log Pipeline] Daily Routine (Auto)`** task. It follows a strictly ordered sequence using Celery chords:
+
+- **Search**: Scans configured directories for new `.log` or `.gz` files.
+- **Validate**: Performs statistical sampling to ensure log integrity and detect the usage date.
+- **Parse**: Extracts metrics using `scielo_usage_counter`, performs URL translation, and aggregates data.
+- **Export**: Pushes results to OpenSearch using idempotent upsert scripts.
+
+### Metadata Synchronization
+
+Metadata is kept in sync with SciELO sources (ArticleMeta, OPAC, Books, etc.) via the **`[Metadata] Daily Sync Routine (Auto)`** task, which runs parallel workers to keep documents and sources up to date.
+
+## Supported Log Formats
+
+| Format | Description |
+|---|---|
+| NCSA Extended | Standard Apache combined log format with optional domain prefix and IP list fields. |
+| BunnyCDN | Pipe-delimited format with Unix timestamps (7 or 10 digits), country codes, and request IDs. |
+
+## Environment Variables
+
+| Variable | Default | Description |
+|---|---|---|
+| `OPENSEARCH_URL` | — | OpenSearch cluster URL |
+| `OPENSEARCH_BASIC_AUTH` | — | OpenSearch basic auth credentials (`user,pass`, as in `.envs/.local/.django`) |
+| `OPENSEARCH_VERIFY_CERTS` | `False` | Verify SSL certificates for OpenSearch connections |
+| `USE_LOCAL_SCIELO_LIBS` | `0` | Mount local `scielo_log_validator` and `scielo_usage_counter` repos for development |
+| `DJANGO_SETTINGS_MODULE` | `config.settings.local` | Django settings module |
+| `REDIS_URL` | — | Redis connection URL for Celery |
+
+## OpenSearch Storage Strategy (Hybrid Monthly)
+
+To optimize storage and performance, this system employs a **Hybrid Granularity** approach in OpenSearch:
+
+- **Monthly Partitioning**: Data is partitioned by month inside yearly, per-collection indices (e.g., `usage_monthly_books_2026`).
+- **One Document per Month**: Each article/PID has exactly **one document per month**, drastically reducing the total document count (up to 30x reduction).
+- **Daily Nested Metrics**: Daily granularity is preserved inside each monthly document using a `daily_metrics` object.
+- **Atomic Upserts**: Data is merged using OpenSearch **Painless Scripts**, allowing multiple logs for the same day/month to be processed without data duplication or loss.
+
+## Management & Monitoring
+
+All pipelines can be monitored through the **Wagtail Admin**:
+
+- **Log Manager**: Monitor the status of individual log files (`QUEUED`, `PARSING`, `PROCESSED`).
+- **Daily Metric Jobs**: Track the history of daily processing and OpenSearch export attempts.
+- **Log Config**: Manage collection-specific settings, log paths, and notification emails.
+
+### Useful Commands
+
+- `make django_shell`: Access the Django interactive shell.
+- `docker logs -f scielo_usage_local_celeryworker`: Monitor real-time task execution.
+
+## Dependencies
+
+- [scielo_log_validator](https://github.com/scieloorg/scielo_log_validator) — log file validation
+- [scielo_usage_counter](https://github.com/scieloorg/scielo_usage_counter) — COUNTER R5.1 metrics extraction
+- [device_detector](https://github.com/thinkwelltwd/device_detector) — client name/version detection
+- [opensearch-py](https://github.com/opensearch-project/opensearch-py) — OpenSearch client
diff --git a/VERSION b/VERSION
index 850e742..227cea2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.14.0
+2.0.0
diff --git a/article/admin.py b/article/admin.py
deleted file mode 100644
index 8c38f3f..0000000
--- a/article/admin.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.contrib import admin
-
-# Register your models here.
diff --git a/article/management/commands/load_articles_by_year.py b/article/management/commands/load_articles_by_year.py deleted file mode 100644 index 335598e..0000000 --- a/article/management/commands/load_articles_by_year.py +++ /dev/null @@ -1,80 +0,0 @@ -from django.core.management.base import BaseCommand - -from article.tasks import task_load_article_from_opac, task_load_article_from_article_meta - - -class Command(BaseCommand): - help = 'Generate task requests for loading article data from Article Meta for each year from 1900 to 2025' - - def add_arguments(self, parser): - parser.add_argument( - '--start-year', - type=int, - default=1990, - help='Start year (default: 1990)' - ) - parser.add_argument( - '--end-year', - type=int, - default=2025, - help='End year (default: 2025)' - ) - parser.add_argument( - '--collection', - type=str, - default='scl', - help='Collection code (default: scl)' - ) - parser.add_argument( - '--task', - choices=['load_article_from_opac', 'load_article_from_article_meta'], - default='load_article_from_opac', - help='Task to execute (default: load_article_from_opac)', - ) - - def handle(self, *args, **options): - start_year = options['start_year'] - end_year = options['end_year'] - collection = options['collection'] - - self.stdout.write( - self.style.SUCCESS( - f'Generating task requests from {start_year} to {end_year} for collection: {collection}' - ) - ) - - total_tasks = 0 - - for year in range(start_year, end_year + 1): - from_date = f'{year}-01-01' - until_date = f'{year}-12-31' - - self.stdout.write(f'Queuing task for year {year}...') - - # Queue the task for each year - if options['task'] == 'load_article_from_article_meta': - task_result = task_load_article_from_article_meta.delay( - from_date=from_date, - until_date=until_date, - collection=collection - ) - else: - task_result = task_load_article_from_opac.delay( - from_date=from_date, - until_date=until_date, - collection=collection - ) - - total_tasks += 1 - - self.stdout.write( - self.style.SUCCESS( - f'✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})' - ) - ) - - self.stdout.write( - self.style.SUCCESS( - f'\nCompleted! {total_tasks} tasks have been queued successfully.' 
- ) - ) diff --git a/article/migrations/0001_initial.py b/article/migrations/0001_initial.py deleted file mode 100644 index 816d61e..0000000 --- a/article/migrations/0001_initial.py +++ /dev/null @@ -1,137 +0,0 @@ -# Generated by Django 5.0.7 on 2025-02-07 17:50 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [ - ("collection", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Article", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "scielo_issn", - models.CharField( - db_index=True, max_length=9, verbose_name="SciELO ISSN" - ), - ), - ( - "pid_v2", - models.CharField( - db_index=True, max_length=23, verbose_name="PID V2" - ), - ), - ( - "pid_v3", - models.CharField( - blank=True, - db_index=True, - max_length=23, - null=True, - verbose_name="PID V3", - ), - ), - ( - "pdfs", - models.JSONField( - blank=True, - default=dict, - null=True, - verbose_name="Format with Language", - ), - ), - ( - "default_lang", - models.CharField(max_length=2, verbose_name="Default Language"), - ), - ( - "text_langs", - models.JSONField( - blank=True, - default=dict, - null=True, - verbose_name="Text Languages", - ), - ), - ( - "processing_date", - models.CharField(max_length=32, verbose_name="Processing Date"), - ), - ( - "publication_date", - models.CharField(max_length=32, verbose_name="Publication Date"), - ), - ( - "publication_year", - models.CharField( - db_index=True, max_length=4, verbose_name="Publication Year" - ), - ), - ( - "collection", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="collection.collection", - verbose_name="Collection", - ), - ), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "verbose_name": "Article", - "verbose_name_plural": "Articles", - "unique_together": {("collection", "scielo_issn", "pid_v2", "pid_v3")}, - }, - ), - ] diff --git a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py b/article/migrations/0002_alter_article_unique_together_article_files_and_more.py deleted file mode 100644 index cee055c..0000000 --- a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py +++ /dev/null @@ -1,42 +0,0 @@ -# Generated by Django 5.0.7 on 2025-04-01 01:09 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("article", "0001_initial"), - ("collection", "0001_initial"), - ] - - operations = [ - migrations.AddField( - model_name="article", - name="files", - field=models.JSONField( - blank=True, default=dict, null=True, verbose_name="Files" - ), - ), - migrations.AddField( - model_name="article", - 
name="pid_generic", - field=models.CharField( - blank=True, - db_index=True, - max_length=50, - null=True, - verbose_name="PID Generic", - ), - ), - migrations.RemoveField( - model_name="article", - name="pdfs", - ), - migrations.AlterUniqueTogether( - name="article", - unique_together={ - ("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic") - }, - ), - - ] diff --git a/article/migrations/0003_article_collection_scielo_issn_idx.py b/article/migrations/0003_article_collection_scielo_issn_idx.py deleted file mode 100644 index 753ac98..0000000 --- a/article/migrations/0003_article_collection_scielo_issn_idx.py +++ /dev/null @@ -1,21 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-12 17:16 - -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("article", "0002_alter_article_unique_together_article_files_and_more"), - ("collection", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.AddIndex( - model_name="article", - index=models.Index( - fields=["collection", "scielo_issn"], name="collection_scielo_issn_idx" - ), - ), - ] diff --git a/article/migrations/__init__.py b/article/migrations/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/article/models.py b/article/models.py deleted file mode 100644 index 80d2a97..0000000 --- a/article/models.py +++ /dev/null @@ -1,143 +0,0 @@ -from django.db import models -from django.utils.translation import gettext_lazy as _ - -from core.models import CommonControlField -from collection.models import Collection - - -class Article(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.CASCADE, - blank=False, - null=False, - db_index=True, - ) - - scielo_issn = models.CharField( - verbose_name=_('SciELO ISSN'), - max_length=9, - blank=False, - null=False, - db_index=True, - ) - - pid_v2 = models.CharField( - verbose_name=_('PID V2'), - max_length=23, - blank=False, - null=False, - db_index=True, - ) - - pid_v3 = models.CharField( - verbose_name=_('PID V3'), - max_length=23, - blank=True, - null=True, - db_index=True, - ) - - pid_generic = models.CharField( - verbose_name=_('PID Generic'), - max_length=50, - blank=True, - null=True, - db_index=True, - ) - - files = models.JSONField( - verbose_name=_('Files'), - null=True, - blank=True, - default=dict, - ) - - default_lang = models.CharField( - verbose_name=_('Default Language'), - max_length=2, - blank=False, - null=False, - ) - - text_langs = models.JSONField( - verbose_name=_('Text Languages'), - null=True, - blank=True, - default=dict, - ) - - processing_date = models.CharField( - verbose_name=_('Processing Date'), - max_length=32, - null=False, - blank=False, - ) - - publication_date = models.CharField( - verbose_name=_('Publication Date'), - max_length=32, - null=False, - blank=False, - ) - - publication_year = models.CharField( - verbose_name=_('Publication Year'), - max_length=4, - null=False, - blank=False, - db_index=True, - ) - - def __str__(self): - return f'{self.collection.acron3} - {self.scielo_issn} - {self.pid_v2 or self.pid_v3 or self.pid_generic}' - - @classmethod - def metadata(cls, collection=None): - qs = cls.objects.select_related('collection').only( - 'collection__acron3', - 'default_lang', - 'files', - 'pid_v2', - 'pid_v3', - 'pid_generic', - 'processing_date', - 'publication_date', - 'publication_year', - 'scielo_issn', - 'text_langs', - ) 
- - if collection: - qs = qs.filter(collection=collection) - - for a in qs.iterator(): - yield { - 'collection': a.collection.acron3, - 'default_lang': a.default_lang, - 'files': a.files, - 'pid_v2': a.pid_v2, - 'pid_v3': a.pid_v3, - 'pid_generic': a.pid_generic, - 'processing_date': a.processing_date, - 'publication_date': a.publication_date, - 'publication_year': a.publication_year, - 'scielo_issn': a.scielo_issn, - 'text_langs': a.text_langs, - } - - class Meta: - verbose_name = _('Article') - verbose_name_plural = _('Articles') - unique_together = ( - 'collection', - 'scielo_issn', - 'pid_v2', - 'pid_v3', - 'pid_generic', - ) - indexes = [ - models.Index(fields=['collection', 'scielo_issn'], name='collection_scielo_issn_idx'), - ] - diff --git a/article/tasks.py b/article/tasks.py deleted file mode 100644 index 3514fca..0000000 --- a/article/tasks.py +++ /dev/null @@ -1,259 +0,0 @@ -import logging - -from django.contrib.auth import get_user_model -from django.db.models import Q -from django.db import DataError -from django.utils.translation import gettext as _ - -from collection.models import Collection -from config import celery_app -from core.utils import date_utils -from core.utils.utils import _get_user - -from journal.models import Journal - -from tracker.models import ArticleEvent -from tracker.choices import ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, ARTICLE_EVENT_TYPE_DATA_ERROR - -from . import models, utils - - -User = get_user_model() - -@celery_app.task(bind=True, name=_('Load article data from Article Meta'), timelimit=-1, queue='load') -def task_load_article_from_article_meta(self, from_date=None, until_date=None, days_to_go_back=None, collection=None, issn=None, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading articles from Article Meta. 
From: {from_date}, Until: {until_date}, Collection: {collection}, ISSN: {issn}.') - - offset = 0 - limit = 1000 - while True: - logging.info(f'{from_date}, {until_date}, {offset}, {limit}, {collection}, {issn}') - response = utils.fetch_article_meta_dict(from_date, until_date, offset=offset, limit=limit, collection=collection, issn=issn) - objects = response.get('objects') - if not objects: - break - - for obj in objects: - codes = obj.get('code_title') - - for issn_code in codes: - jou = Journal.objects.filter( - Q(issns__electronic_issn=issn_code) | - Q(issns__scielo_issn=issn_code) | - Q(issns__print_issn=issn_code) - ).first() - if not jou: - continue - - if not jou: - logging.info(f'Journal not found for ISSNs: {codes}') - continue - - col_obj = Collection.objects.get(acron3=obj.get('collection')) - if not col_obj: - logging.info(f'Collection not found: {obj.get("collection")}') - continue - - try: - article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code')) - if created or force_update: - article.files = obj.get('pdfs') or {} - article.processing_date = obj.get('processing_date') or '' - article.publication_date = obj.get('publication_date') or '' - article.publication_year = obj.get('publication_year') or '' - article.default_lang = obj.get('default_language') or '' - article.text_langs = obj.get('text_langs') or '' - - article.save() - logging.info(f'Article {"created" if created else "updated"}: {article}') - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}', - data=obj - ) - continue - except DataError as e: - logging.error(f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}', - data=obj - ) - continue - - offset += limit - - return True - - -@celery_app.task(bind=True, name=_('Load article data from OPAC'), timelimit=-1, queue='load') -def task_load_article_from_opac(self, collection='scl', from_date=None, until_date=None, days_to_go_back=None, page=1, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading articles from OPAC. 
From: {from_date}, Until: {until_date}') - - while True: - response = utils.fetch_opac_dict(from_date, until_date, page=page) - - documents = response.get('documents') - - for doc_id, doc in documents.items(): - col_obj = Collection.objects.get(acron3=collection) - if not col_obj: - logging.error(f'Collection not found: {collection}') - continue - - journal = Journal.objects.get(collection=col_obj, acronym=doc.get('journal_acronym')) - if not journal: - logging.error(f'Journal not found: {doc.get("journal_acronym")}') - continue - - try: - article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=journal.scielo_issn, pid_v2=doc.get('pid_v2')) - - if created or force_update: - article.pid_v3 = doc.get('pid_v3') or '' - if not created: - article.pid_v2 = doc.get('pid_v2') or '' - article.publication_date = doc.get('publication_date') or article.publication_date or '' - article.default_lang = doc.get('default_language') or article.default_lang or '' - - try: - article.publication_year = article.publication_date[:4] - except IndexError: - article.publication_year = '' - - article.save() - logging.info(f'Article {"created" if created else "updated"}: {article}') - - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error getting Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error creating Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}', - data=doc - ) - continue - except DataError as e: - logging.error(f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}', - data=doc - ) - continue - - page += 1 - if page > int(response.get('pages', 0)): - break - - return True - - -@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1, queue='load') -def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading preprints from SciELO Preprints. 
From: {from_date}, Until: {until_date}') - - col_obj = Collection.objects.get(acron3='preprints') - if not col_obj: - logging.error(f'Collection not found: preprints') - return False - - for record in utils.fetch_preprint_oai_pmh(from_date, until_date): - data = utils.extract_preprint_data(record) - - if not data.get('pid_generic'): - logging.error(f'Preprint ID not found in record: {record}') - continue - - try: - article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic']) - if created or force_update: - article.text_langs = data.get('text_langs') - article.default_lang = data.get('default_language') - article.publication_date = data.get('publication_date') - article.publication_year = data.get('publication_year') - - # Preprints do not have a scielo_issn yet - article.scielo_issn = '0000-0000' - - article.save() - logging.debug(f'Article {"created" if created else "updated"}: {article}') - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}', - data=data - ) - continue - except DataError as e: - logging.error(f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}', - data=data - ) - continue - - -@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1, queue='load') -def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None): - user = _get_user(self.request, username=username, user_id=user_id) - - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - logging.info(f'Loading dataset metadata from SciELO Data. From: {from_date}, Until: {until_date}') - - col_obj = Collection.objects.get(acron3='data') - if not col_obj: - logging.error(f'Collection not found: data') - return False - - for record in utils.fetch_dataverse_metadata(from_date, until_date): - dataset_doi = record.get('dataset_doi') - if not dataset_doi: - logging.error(f'Dataset DOI not found in record: {record}') - continue - - try: - dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi) - if created or force_update: - dataset.publication_date = record.get('dataset_published') - - file_persistent_id = record.get('file_persistent_id') - file_id = record.get('file_id') - file_name = record.get('file_name') - file_url = record.get('file_url') - - if file_id: - dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persisent_id': file_persistent_id} - - dataset.save() - logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}') - except models.Article.MultipleObjectsReturned as e: - logging.error(f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, - message=f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}', - data=record - ) - continue - except DataError as e: - logging.error(f'Error saving Dataset: {e}. 
Collection: {col_obj}, PID: {dataset_doi}') - ArticleEvent.create( - event_type=ARTICLE_EVENT_TYPE_DATA_ERROR, - message=f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}', - data=record - ) - continue - - return True diff --git a/article/tests.py b/article/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/article/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/article/utils.py b/article/utils.py deleted file mode 100644 index b9a094e..0000000 --- a/article/utils.py +++ /dev/null @@ -1,204 +0,0 @@ -import logging -import requests -import os - -from sickle import Sickle -from time import sleep - -from core.utils import standardizer - - -ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict') -ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5)) -ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30)) - -OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict') -OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5)) -OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30)) - -OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai') -OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc') -OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5)) - -DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api') -DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata') -DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5)) -DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30)) - - -def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None): - for t in range(1, ARTICLEMETA_MAX_RETRIES + 1): - params = { - 'from': from_date, - 'until': until_date, - 'offset': offset, - 'limit': limit - } - - if collection: - params['collection'] = collection - - if issn: - params['issn'] = issn - - response = requests.get(ARTICLEMETA_ENDPOINT, params=params) - - try: - response.raise_for_status() - logging.info(response.url) - - except requests.exceptions.HTTPError: - logging.warning( - 'Failed to collect data from %s. Waiting %d seconds before retry %d of %d' % ( - response.url, - ARTICLEMETA_SLEEP_TIME, - t, - ARTICLEMETA_MAX_RETRIES - ) - ) - sleep(ARTICLEMETA_SLEEP_TIME) - - else: - return response.json() - - -def fetch_opac_dict(from_date, until_date, page=1): - for t in range(1, OPAC_MAX_RETRIES + 1): - params = { - 'begin_date': from_date, - 'end_date': until_date, - 'page': page - } - - response = requests.get(url=OPAC_ENDPOINT, params=params, verify=False) - - try: - response.raise_for_status() - logging.info(response.url) - - except requests.exceptions.HTTPError: - logging.warning('Não foi possível coletar dados de %s. 
Aguardando %d segundos para tentativa %d de %d' % (response.url, OPAC_SLEEP_TIME, t, OPAC_MAX_RETRIES)) - sleep(OPAC_SLEEP_TIME) - - else: - return response.json() - - -def fetch_preprint_oai_pmh(from_date, until_date): - oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT, max_retries=OAI_PMH_MAX_RETRIES, verify=False) - records = oai_client.ListRecords(**{ - 'metadataPrefix': OAI_METADATA_PREFIX, - 'from': from_date, - 'until': until_date, - }) - - for r in records: - yield r - - -def extract_preprint_data(record): - pid_generic = _extract_preprint_compatible_identifer(record.header.identifier) - text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])] - publication_date = record.metadata.get('date', [''])[0] - default_language = text_langs[0] if text_langs else '' - publication_year = _extract_preprint_publication_year_from_date(publication_date) - - data = { - 'pid_generic': pid_generic, - 'text_langs': text_langs, - 'publication_date': publication_date, - 'default_language': default_language, - 'publication_year': publication_year - } - - return data - - -def _extract_preprint_compatible_identifer(pid_v2): - try: - # piv_v2 should be something like oai:ops.preprints.scielo.org:preprint/1195 - # we are using the last part of the string as the identifier - return pid_v2.split(':')[-1].split('/')[1] - except IndexError: - return '' - - -def _extract_preprint_publication_year_from_date(date_str): - try: - return date_str[:4] - except IndexError: - return '' - - -def fetch_dataverse_metadata(from_date=None, until_date=None): - def get_subdataverses(): - url = f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents" - try: - response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) - response.raise_for_status() - return response.json().get("data", []) - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching subdataverses: {e}") - return [] - - def get_datasets(subdataverse_id): - url = f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents" - try: - response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) - response.raise_for_status() - return response.json().get("data", []) - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching datasets for subdataverse {subdataverse_id}: {e}") - return [] - - def get_files(dataset_id): - url = f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files" - try: - response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME) - response.raise_for_status() - return response.json().get("data", []) - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching files for dataset {dataset_id}: {e}") - return [] - - subdataverses = get_subdataverses() - - for subdataverse in subdataverses: - if subdataverse["type"] != "dataverse": - continue - - subdataverse_id = subdataverse["id"] - subdataverse_title = subdataverse["title"] - datasets = get_datasets(subdataverse_id) - - for dataset in datasets: - if dataset["type"] != "dataset": - continue - - dataset_id = dataset["id"] - doi = standardizer.standardize_doi(dataset.get("persistentUrl")) - if not doi: - logging.warning(f"Dataset {dataset_id} does not have a DOI.") - continue - - publication_date = dataset.get("publicationDate", None) - - if publication_date: - if (from_date and publication_date < from_date) or (until_date and publication_date > until_date): - continue - - files = get_files(dataset_id) - - for file in files: - file_persistent_id = 
file["dataFile"].get("persistentId", None) - file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None - - yield { - "title": subdataverse_title, - "dataset_doi": doi, - "dataset_published": publication_date, - "file_id": file["dataFile"]["id"], - "file_name": file["label"], - "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}", - "file_persistent_id": file_persistent_id_stz, - } diff --git a/article/views.py b/article/views.py deleted file mode 100644 index 91ea44a..0000000 --- a/article/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. diff --git a/collection/models.py b/collection/models.py index f2d7ce7..87da123 100644 --- a/collection/models.py +++ b/collection/models.py @@ -9,7 +9,7 @@ from core.forms import CoreAdminModelForm from core.models import CommonControlField, Language, TextWithLang -from core.utils.utils import fetch_data +from core.utils.request_utils import fetch_data from . import choices diff --git a/collection/tasks.py b/collection/tasks.py index 02fd0e7..19372de 100644 --- a/collection/tasks.py +++ b/collection/tasks.py @@ -1,14 +1,14 @@ from django.contrib.auth import get_user_model from django.utils.translation import gettext as _ -from core.utils.utils import _get_user +from core.utils.request_utils import _get_user from collection.models import Collection from config import celery_app User = get_user_model() -@celery_app.task(bind=True, name=_('Load collection data')) +@celery_app.task(bind=True, name=_('[Collection] Load Collection Data')) def task_load_collections(self, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) Collection.load(user) diff --git a/collection/wagtail_hooks.py b/collection/wagtail_hooks.py index e7b7e97..52b31a8 100644 --- a/collection/wagtail_hooks.py +++ b/collection/wagtail_hooks.py @@ -1,8 +1,5 @@ from django.utils.translation import gettext as _ from wagtail.snippets.views.snippets import SnippetViewSet -from wagtail.snippets.models import register_snippet - -from config.menu import get_menu_order from .models import Collection @@ -10,10 +7,8 @@ class CollectionSnippetViewSet(SnippetViewSet): model = Collection icon = "folder-open-inverse" - menu_name = 'collection' menu_label = _("Collection") - menu_order = get_menu_order("collection") - add_to_admin_menu = True + menu_order = 100 list_display = ( "main_name", @@ -57,6 +52,3 @@ class CollectionSnippetViewSet(SnippetViewSet): "updated_by", ) export_filename = "collections" - - -register_snippet(CollectionSnippetViewSet) diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile index 4351d9e..aac7972 100644 --- a/compose/local/django/Dockerfile +++ b/compose/local/django/Dockerfile @@ -23,8 +23,7 @@ COPY ./requirements . RUN python -m pip install --upgrade pip # Create Python Dependency and Sub-Dependency Wheels. 
-RUN pip wheel --wheel-dir /usr/src/app/wheels \ - -r ${BUILD_ENVIRONMENT}.txt +RUN pip wheel --wheel-dir /usr/src/app/wheels -r ${BUILD_ENVIRONMENT}.txt # Python 'run' stage diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start index 7db6f27..f0c7efc 100644 --- a/compose/local/django/celery/worker/start +++ b/compose/local/django/celery/worker/start @@ -21,4 +21,14 @@ watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concu # Worker para arg bol cub data ecu per preprints pry rve spa sss sza ury wid watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small -n worker.parse_small@%h & -wait \ No newline at end of file +# Workers seriais adicionais para backfill paralelo de colecoes pequenas +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_1 -n worker.parse_small_1@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_2 -n worker.parse_small_2@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_3 -n worker.parse_small_3@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_4 -n worker.parse_small_4@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_5 -n worker.parse_small_5@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_6 -n worker.parse_small_6@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_7 -n worker.parse_small_7@%h & +watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_8 -n worker.parse_small_8@%h & + +wait diff --git a/compose/production/django/celery/worker/start b/compose/production/django/celery/worker/start index 4fb112e..6269dd5 100644 --- a/compose/production/django/celery/worker/start +++ b/compose/production/django/celery/worker/start @@ -22,4 +22,14 @@ celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_medium -n wo # Worker para arg bol cub data ecu per preprints pry rve spa sss sza ury wid (coleções pequenas) celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small -n worker.parse_small@%h & -wait \ No newline at end of file +# Workers seriais adicionais para backfill paralelo de colecoes pequenas +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_1 -n worker.parse_small_1@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_2 -n worker.parse_small_2@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_3 -n worker.parse_small_3@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_4 -n worker.parse_small_4@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_5 -n worker.parse_small_5@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_6 -n worker.parse_small_6@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_7 -n worker.parse_small_7@%h & +celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_8 -n worker.parse_small_8@%h & + +wait diff --git a/compose/production/django/entrypoint b/compose/production/django/entrypoint index 599841e..02470cd 100644 --- 
a/compose/production/django/entrypoint +++ b/compose/production/django/entrypoint @@ -16,6 +16,20 @@ if [ -z "${POSTGRES_USER}" ]; then fi export DATABASE_URL="postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}" +if [ "${USE_LOCAL_SCIELO_LIBS:-0}" = "1" ]; then + for path in /app/scielo_log_validator /app/scielo_usage_counter; do + if [ ! -f "${path}/setup.py" ] && [ ! -f "${path}/pyproject.toml" ]; then + >&2 echo "Local lib path not ready: ${path}" + exit 1 + fi + done + + >&2 echo "Installing local SciELO libs from mounted repositories..." + pip install --root-user-action=ignore --no-cache-dir --no-build-isolation --no-deps \ + -e /app/scielo_log_validator \ + -e /app/scielo_usage_counter +fi + postgres_ready() { python << END import sys diff --git a/config/collections.py b/config/collections.py new file mode 100644 index 0000000..9aa3efe --- /dev/null +++ b/config/collections.py @@ -0,0 +1,63 @@ +COLLECTION_ACRON3_SIZE_MAP = { + "scl": "xlarge", + "chl": "large", + "col": "large", + "mex": "large", + "cri": "medium", + "esp": "medium", + "psi": "medium", + "prt": "medium", + "ven": "medium", + "arg": "small", + "bol": "small", + "books": "small", + "cub": "small", + "data": "small", + "dom": "small", + "ecu": "small", + "per": "small", + "preprints": "small", + "pry": "small", + "rve": "small", + "spa": "small", + "sss": "small", + "sza": "small", + "ury": "small", + "wid": "small", +} + +COLLECTION_SIZE_SAMPLE_MAP = { + "small": 1.0, + "medium": 0.5, + "large": 0.1, + "xlarge": 0.1, +} + +LOG_MANAGER_SEED_DATA = [ + {"acronym": "arg", "directory_name": "Site clássico", "path": "/app/logs/scielo.ar", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "bol", "directory_name": "Site clássico", "path": "/app/logs/scielo.bo", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "chl", "directory_name": "Site clássico", "path": "/app/logs/scielo.cl", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "col", "directory_name": "Site clássico", "path": "/app/logs/scielo.co", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "cri", "directory_name": "Site clássico", "path": "/app/logs/scielo.cr", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "cub", "directory_name": "Site clássico", "path": "/app/logs/scielo.cu", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "data", "directory_name": "Site clássico", "path": "/app/logs/dataverse", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "dataverse"}, + {"acronym": "dom", "directory_name": "Site novo", "path": "/app/logs/scielo.dom", "quantity": 1, "start_date": "2026-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac"}, + {"acronym": "ecu", "directory_name": "Site clássico", "path": "/app/logs/scielo.ec", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "esp", "directory_name": "Site clássico", "path": "/app/logs/scielo.es", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": 
"classic"}, + {"acronym": "mex", "directory_name": "Site clássico", "path": "/app/logs/scielo.mx", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "per", "directory_name": "Site clássico", "path": "/app/logs/scielo.pe", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "preprints", "directory_name": "Site clássico", "path": "/app/logs/submission-node01", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "preprints"}, + {"acronym": "prt", "directory_name": "Site clássico", "path": "/app/logs/scielo.pt", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "pry", "directory_name": "Site clássico", "path": "/app/logs/scielo.py", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "psi", "directory_name": "Site clássico", "path": "/app/logs/scielo.pepsic", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "rve", "directory_name": "Site clássico", "path": "/app/logs/scielo.revenf", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "rvt", "directory_name": "Site clássico", "path": "/app/logs/scielo.revtur", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "scl", "directory_name": "Site novo", "path": "/app/logs/scielo.br", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac"}, + {"acronym": "spa", "directory_name": "Site novo - versão prévia", "path": "/app/logs/scielo.sp", "quantity": 2, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac_alpha"}, + {"acronym": "sss", "directory_name": "Site clássico", "path": "/app/logs/scielo.ss", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "sza", "directory_name": "Site clássico", "path": "/app/logs/scielo.za", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "ury", "directory_name": "Site clássico", "path": "/app/logs/scielo.uy", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "ven", "directory_name": "Site clássico", "path": "/app/logs/scielo.ve", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "wid", "directory_name": "Site clássico", "path": "/app/logs/scielo.wi", "quantity": 2, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"}, + {"acronym": "books", "directory_name": "SciELO Books", "path": "/app/logs/books", "quantity": 1, "start_date": "2012-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "books"}, +] diff --git a/config/menu.py b/config/menu.py index 13371c6..844ce0c 100644 --- a/config/menu.py +++ b/config/menu.py @@ -1,13 +1,10 @@ WAGTAIL_MENU_APPS_ORDER = { - "collection": 100, - "article": 200, - "journal": 300, - "resources": 400, - "log_manager": 500, - "log_manager_config": 600, - "metrics": 700, - "tasks": 800, - "unexpected-error": 900, + 
"metadata": 100, + "resources": 200, + "log_manager": 300, + "tracker": 400, + "metrics": 500, + "tasks": 600, } def get_menu_order(app_name): diff --git a/config/settings/base.py b/config/settings/base.py index 4e96ed4..e4a99fa 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -5,7 +5,8 @@ from pathlib import Path import environ -from django.utils.translation import gettext_lazy as _ + +from config.collections import COLLECTION_ACRON3_SIZE_MAP # noqa: F401 ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent # core/ @@ -114,14 +115,15 @@ "core.users", "core_settings", # Your stuff: custom apps go here - "article", "collection", "core", - "journal", + "document", "log_manager", "log_manager_config", "metrics", + "reports", "resources", + "source", "tracker", ] @@ -404,36 +406,54 @@ SEARCH_PAGINATION_ITEMS_PER_PAGE = 10 -# Elasticsearch +# OpenSearch # ------------------------------------------------------------------------------ -ES_URL = env("ES_URL", default="http://192.168.0.33:9200/") -ES_INDEX_NAME = env("ES_INDEX_NAME", default="usage") -ES_API_KEY = env("ES_API_KEY", default="") -ES_BASIC_AUTH = env("ES_BASIC_AUTH", default=("elastic", "iHktg66E")) -ES_VERIFY_CERTS = env.bool("ES_VERIFY_CERTS", default=False) +OPENSEARCH_URL = env("OPENSEARCH_URL", default="http://localhost:9200/") +OPENSEARCH_INDEX_NAME = env("OPENSEARCH_INDEX_NAME", default="usage") +OPENSEARCH_API_KEY = env("OPENSEARCH_API_KEY", default="") +OPENSEARCH_BASIC_AUTH = env( + "OPENSEARCH_BASIC_AUTH", + default=("admin", "admin"), +) +OPENSEARCH_VERIFY_CERTS = env.bool( + "OPENSEARCH_VERIFY_CERTS", + default=False, +) + +# Collectors configuration +# ------------------------------------------------------------------------------ +# ArticleMeta +ARTICLEMETA_COLLECT_URL = env( + "ARTICLEMETA_COLLECT_URL", + default="http://articlemeta.scielo.org/api/v1/article/counter_dict", +) +ARTICLEMETA_MAX_RETRIES = env.int("ARTICLEMETA_MAX_RETRIES", default=5) +ARTICLEMETA_SLEEP_TIME = env.int("ARTICLEMETA_SLEEP_TIME", default=30) + +# Dataverse +DATAVERSE_ENDPOINT = env("DATAVERSE_ENDPOINT", default="https://data.scielo.org/api") +DATAVERSE_ROOT_COLLECTION = env("DATAVERSE_ROOT_COLLECTION", default="scielodata") +DATAVERSE_SLEEP_TIME = env.int("DATAVERSE_SLEEP_TIME", default=30) + +# OPAC +OPAC_ENDPOINT = env("OPAC_ENDPOINT", default="https://www.scielo.br/api/v1/counter_dict") +OPAC_MAX_RETRIES = env.int("OPAC_MAX_RETRIES", default=5) +OPAC_SLEEP_TIME = env.int("OPAC_SLEEP_TIME", default=30) + +# Preprints +OAI_PMH_PREPRINT_ENDPOINT = env( + "OAI_PMH_PREPRINT_ENDPOINT", + default="https://preprints.scielo.org/index.php/scielo/oai", +) +OAI_METADATA_PREFIX = env("OAI_METADATA_PREFIX", default="oai_dc") +OAI_PMH_MAX_RETRIES = env.int("OAI_PMH_MAX_RETRIES", default=5) + +# SciELO Books +SCIELO_BOOKS_BASE_URL = env("SCIELO_BOOKS_BASE_URL", default="http://localhost:5984") +SCIELO_BOOKS_TIMEOUT = env.int("SCIELO_BOOKS_TIMEOUT", default=60) +SCIELO_BOOKS_DB_NAME = env("SCIELO_BOOKS_DB_NAME", default="scielobooks_1a") +SCIELO_BOOKS_LIMIT = env.int("SCIELO_BOOKS_LIMIT", default=1000) # Collection size categories # ------------------------------------------------------------------------------ -EXTRA_LARGE_COLLECTIONS = env.list("EXTRA_LARGE_COLLECTIONS", default=["scl"]) -LARGE_COLLECTIONS = env.list("LARGE_COLLECTIONS", default=["chl", "col", "mex"]) -MEDIUM_COLLECTIONS = env.list("MEDIUM_COLLECTIONS", default=["cri", "esp", "psi", "prt", "ven"]) -SMALL_COLLECTIONS = env.list("SMALL_COLLECTIONS", 
default=["arg", "bol", "cub", "data", "ecu", "per", "preprints", "pry", "rve", "spa", "sss", "sza", "ury", "wid"]) - -# Collection size mapping -def _build_collection_size_map(): - """Build mapping of collection acronyms to their size categories.""" - size_map = {} - size_categories = { - "xlarge": EXTRA_LARGE_COLLECTIONS, - "large": LARGE_COLLECTIONS, - "medium": MEDIUM_COLLECTIONS, - "small": SMALL_COLLECTIONS, - } - - for size, collections in size_categories.items(): - for acron3 in collections: - size_map[acron3] = size - - return size_map - -COLLECTION_ACRON3_SIZE_MAP = _build_collection_size_map() +SUPPORTED_LOGFILE_EXTENSIONS = env.list("SUPPORTED_LOGFILE_EXTENSIONS", default=[".log", ".gz", ".zip"]) diff --git a/core/collectors/__init__.py b/core/collectors/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/core/collectors/__init__.py @@ -0,0 +1 @@ + diff --git a/core/collectors/articlemeta.py b/core/collectors/articlemeta.py new file mode 100644 index 0000000..7f6ace0 --- /dev/null +++ b/core/collectors/articlemeta.py @@ -0,0 +1,60 @@ +import logging + +import requests +from django.conf import settings +from articlemeta.client import RestfulClient, ThriftClient +from time import sleep + + +def fetch_article_counter_dict( + from_date, + until_date, + offset=0, + limit=1000, + collection=None, + issn=None, +): + for attempt in range(1, settings.ARTICLEMETA_MAX_RETRIES + 1): + params = { + "from": from_date, + "until": until_date, + "offset": offset, + "limit": limit, + } + + if collection: + params["collection"] = collection + + if issn: + params["issn"] = issn + + response = requests.get(settings.ARTICLEMETA_COLLECT_URL, params=params) + + try: + response.raise_for_status() + logging.info(response.url) + except requests.exceptions.HTTPError: + logging.warning( + "Failed to collect data from %s. 
Waiting %d seconds before retry %d of %d", + response.url, + settings.ARTICLEMETA_SLEEP_TIME, + attempt, + settings.ARTICLEMETA_MAX_RETRIES, + ) + sleep(settings.ARTICLEMETA_SLEEP_TIME) + else: + return response.json() + + return {} + + +def iter_journals(collection="scl", mode="rest"): + if mode == "rest": + client = RestfulClient() + elif mode == "thrift": + client = ThriftClient() + else: + raise ValueError(f"Unsupported ArticleMeta mode: {mode}") + + for journal in client.journals(collection=collection): + yield journal diff --git a/core/collectors/dataverse.py b/core/collectors/dataverse.py new file mode 100644 index 0000000..ca51fd7 --- /dev/null +++ b/core/collectors/dataverse.py @@ -0,0 +1,75 @@ +import logging + +import requests +from django.conf import settings + +from core.utils import standardizer + + +def _request_json(url): + try: + response = requests.get(url, timeout=settings.DATAVERSE_SLEEP_TIME) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as exc: + logging.error("Error fetching %s: %s", url, exc) + return {} + + +def _get_subdataverses(): + url = f"{settings.DATAVERSE_ENDPOINT}/dataverses/{settings.DATAVERSE_ROOT_COLLECTION}/contents" + return _request_json(url).get("data", []) + + +def _get_datasets(subdataverse_id): + url = f"{settings.DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents" + return _request_json(url).get("data", []) + + +def _get_files(dataset_id): + url = f"{settings.DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files" + return _request_json(url).get("data", []) + + +def iter_dataset_metadata(from_date=None, until_date=None): + for subdataverse in _get_subdataverses(): + if subdataverse.get("type") != "dataverse": + continue + + subdataverse_id = subdataverse["id"] + subdataverse_title = subdataverse["title"] + + for dataset in _get_datasets(subdataverse_id): + if dataset.get("type") != "dataset": + continue + + dataset_id = dataset["id"] + doi = standardizer.standardize_doi(dataset.get("persistentUrl")) + if not doi: + logging.warning("Dataset %s does not have a DOI.", dataset_id) + continue + + publication_date = dataset.get("publicationDate") + if publication_date: + if (from_date and publication_date < from_date) or ( + until_date and publication_date > until_date + ): + continue + + for file_data in _get_files(dataset_id): + file_persistent_id = file_data["dataFile"].get("persistentId") + standardized_persistent_id = ( + standardizer.standardize_pid_generic(file_persistent_id) + if file_persistent_id + else None + ) + + yield { + "title": subdataverse_title, + "dataset_doi": doi, + "dataset_published": publication_date, + "file_id": file_data["dataFile"]["id"], + "file_name": file_data["label"], + "file_url": f"{settings.DATAVERSE_ENDPOINT}/access/datafile/{file_data['dataFile']['id']}", + "file_persistent_id": standardized_persistent_id, + } diff --git a/core/collectors/opac.py b/core/collectors/opac.py new file mode 100644 index 0000000..94122b7 --- /dev/null +++ b/core/collectors/opac.py @@ -0,0 +1,33 @@ +import logging + +import requests +from django.conf import settings +from time import sleep + + +def fetch_counter_dict(from_date, until_date, page=1): + for attempt in range(1, settings.OPAC_MAX_RETRIES + 1): + params = { + "begin_date": from_date, + "end_date": until_date, + "page": page, + } + + response = requests.get(url=settings.OPAC_ENDPOINT, params=params, verify=False) + + try: + response.raise_for_status() + logging.info(response.url) + except 
requests.exceptions.HTTPError: + logging.warning( + "Could not collect data from %s. Waiting %d seconds for attempt %d of %d", + response.url, + settings.OPAC_SLEEP_TIME, + attempt, + settings.OPAC_MAX_RETRIES, + ) + sleep(settings.OPAC_SLEEP_TIME) + else: + return response.json() + + return {} diff --git a/core/collectors/preprints.py b/core/collectors/preprints.py new file mode 100644 index 0000000..bead72c --- /dev/null +++ b/core/collectors/preprints.py @@ -0,0 +1,55 @@ +from django.conf import settings +from sickle import Sickle + +from core.utils import standardizer + + +def iter_records(from_date, until_date): + oai_client = Sickle( + endpoint=settings.OAI_PMH_PREPRINT_ENDPOINT, + max_retries=settings.OAI_PMH_MAX_RETRIES, + verify=False, + ) + records = oai_client.ListRecords( + **{ + "metadataPrefix": settings.OAI_METADATA_PREFIX, + "from": from_date, + "until": until_date, + } + ) + + for record in records: + yield record + + +def extract_record_data(record): + pid_generic = _extract_compatible_identifier(record.header.identifier) + text_langs = [ + standardizer.standardize_language_code(language) + for language in record.metadata.get("language", []) + ] + publication_date = record.metadata.get("date", [""])[0] + default_language = text_langs[0] if text_langs else "" + publication_year = _extract_publication_year_from_date(publication_date) + + return { + "pid_generic": pid_generic, + "text_langs": text_langs, + "publication_date": publication_date, + "default_language": default_language, + "publication_year": publication_year, + } + + +def _extract_compatible_identifier(identifier): + try: + return identifier.split(":")[-1].split("/")[1] + except IndexError: + return "" + + +def _extract_publication_year_from_date(date_str): + try: + return date_str[:4] + except IndexError: + return "" diff --git a/core/collectors/scielo_books.py b/core/collectors/scielo_books.py new file mode 100644 index 0000000..b1f2dd8 --- /dev/null +++ b/core/collectors/scielo_books.py @@ -0,0 +1,182 @@ +import logging + +import requests +from django.conf import settings +from urllib.parse import urlencode + + + + +def build_url(base_url, params=None): + if not params: + return base_url + return f"{base_url}?{urlencode(params, doseq=True)}" + + +def sanitize_raw_data(payload): + if not isinstance(payload, dict): + return payload + + if "_id" not in payload: + return payload + + sanitized = dict(payload) + sanitized["id"] = sanitized.pop("_id") + return sanitized + + +def fetch_document(doc_id, base_url=None, db_name=None, headers=None): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + resolved_base_url = base_url or settings.SCIELO_BOOKS_BASE_URL + if not resolved_base_url: + logging.error("Sem base url definida para coleta de books") + raise ValueError("SCIELO_BOOKS_BASE_URL is not configured") + + url = f"{resolved_base_url}/{db_name}/{doc_id}" + response = requests.get(url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False) + response.raise_for_status() + payload = response.json() + return sanitize_raw_data(payload), url + + +def fetch_changes_page( + base_url=None, + db_name=None, + since=0, + limit=None, + include_docs=False, + headers=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + resolved_base_url = base_url or settings.SCIELO_BOOKS_BASE_URL + if not resolved_base_url: + logging.error("Sem base url definida para coleta de books") + raise ValueError("SCIELO_BOOKS_BASE_URL is not configured") + + params = { + 
"since": since, + "limit": limit, + } + if include_docs: + params["include_docs"] = "true" + + url = build_url(f"{resolved_base_url}/{db_name}/_changes", params) + response = requests.get(url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False) + response.raise_for_status() + payload = response.json() + return payload if isinstance(payload, dict) else {} + + +def extract_changes(payload): + if isinstance(payload, dict) and isinstance(payload.get("results"), list): + return payload.get("results") + return [] + + +def extract_last_seq(payload): + if isinstance(payload, dict): + return payload.get("last_seq") or payload.get("seq") + return None + + +def iter_changes( + base_url=None, + db_name=None, + since=0, + limit=None, + headers=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + current_since = since or 0 + + while True: + payload = fetch_changes_page( + base_url=base_url, + db_name=db_name, + since=current_since, + limit=limit, + include_docs=False, + headers=headers, + ) + changes = extract_changes(payload) + if not changes: + break + + for change in changes: + yield change + + last_seq = extract_last_seq(payload) + if last_seq is None or last_seq == current_since: + break + current_since = last_seq + + +def iter_change_documents( + base_url=None, + db_name=None, + since=0, + limit=None, + headers=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + current_since = since or 0 + + while True: + payload = fetch_changes_page( + base_url=base_url, + db_name=db_name, + since=current_since, + limit=limit, + include_docs=True, + headers=headers, + ) + changes = extract_changes(payload) + if not changes: + break + + for change in changes: + doc_id = change.get("id") + if not doc_id: + continue + + deleted = bool(change.get("deleted")) + raw_doc = change.get("doc") or {} + if deleted: + yield { + "change": change, + "deleted": True, + "payload": None, + "source_url": None, + } + continue + + if raw_doc: + sanitized = sanitize_raw_data(raw_doc) + yield { + "change": change, + "deleted": False, + "payload": sanitized, + "source_url": f"{(base_url or settings.SCIELO_BOOKS_BASE_URL)}/{db_name}/{doc_id}", + } + continue + + document_payload, source_url = fetch_document( + doc_id=doc_id, + base_url=base_url, + db_name=db_name, + headers=headers, + ) + yield { + "change": change, + "deleted": False, + "payload": document_payload, + "source_url": source_url, + } + + last_seq = extract_last_seq(payload) + if last_seq is None or last_seq == current_since: + break + current_since = last_seq diff --git a/core/models.py b/core/models.py index 1aeab73..2a4ecbf 100644 --- a/core/models.py +++ b/core/models.py @@ -11,7 +11,7 @@ from wagtailautocomplete.edit_handlers import AutocompletePanel from . 
import choices -from .utils.utils import language_iso +from .utils.standardizer import language_iso User = get_user_model() diff --git a/article/__init__.py b/core/tests/__init__.py similarity index 100% rename from article/__init__.py rename to core/tests/__init__.py diff --git a/core/tests/tests_collectors.py b/core/tests/tests_collectors.py new file mode 100644 index 0000000..6d13a7c --- /dev/null +++ b/core/tests/tests_collectors.py @@ -0,0 +1,55 @@ +import unittest +from unittest.mock import patch + +from core.collectors import scielo_books + + +class SciELOBooksCollectorTests(unittest.TestCase): + def test_build_url_appends_query_params(self): + url = scielo_books.build_url( + "https://books.example/_changes", + {"since": 10, "limit": 100}, + ) + + self.assertEqual(url, "https://books.example/_changes?since=10&limit=100") + + def test_sanitize_raw_data_renames__id(self): + payload = {"_id": "abc123", "TYPE": "Monograph"} + + sanitized = scielo_books.sanitize_raw_data(payload) + + self.assertEqual(sanitized["id"], "abc123") + self.assertNotIn("_id", sanitized) + self.assertEqual(sanitized["TYPE"], "Monograph") + + def test_extract_last_seq_accepts_both_couch_formats(self): + self.assertEqual(scielo_books.extract_last_seq({"last_seq": 123}), 123) + self.assertEqual(scielo_books.extract_last_seq({"seq": 456}), 456) + + @patch("core.collectors.scielo_books.fetch_document") + @patch("core.collectors.scielo_books.fetch_changes_page") + def test_iter_change_documents_uses_docs_from_changes_payload(self, mock_fetch_changes_page, mock_fetch_document): + mock_fetch_changes_page.side_effect = [ + { + "results": [ + { + "seq": 10, + "id": "book1", + "doc": {"_id": "book1", "TYPE": "Monograph", "title": "Book One"}, + } + ], + "last_seq": 10, + }, + {"results": [], "last_seq": 10}, + ] + + results = list(scielo_books.iter_change_documents(base_url="https://books.example", db_name="scielobooks_1a")) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0]["payload"]["id"], "book1") + self.assertEqual(results[0]["payload"]["TYPE"], "Monograph") + mock_fetch_document.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/core/tests_date_utils.py b/core/tests/tests_date_utils.py similarity index 87% rename from core/tests_date_utils.py rename to core/tests/tests_date_utils.py index 9f2b657..8d4f9b6 100644 --- a/core/tests_date_utils.py +++ b/core/tests/tests_date_utils.py @@ -86,7 +86,15 @@ def test_extract_minute_second_key(self): key = extract_minute_second_key(dt) self.assertEqual(key, '30:45') + def test_extract_minute_second_key_returns_none_for_invalid_datetime(self): + self.assertIsNone(extract_minute_second_key(None)) + self.assertIsNone(extract_minute_second_key("invalid-date")) + def test_truncate_datetime_to_hour(self): dt = datetime(2023, 3, 15, 14, 30, 45) truncated = truncate_datetime_to_hour(dt) self.assertEqual(truncated, datetime(2023, 3, 15, 14, 0, 0)) + + def test_truncate_datetime_to_hour_returns_none_for_invalid_datetime(self): + self.assertIsNone(truncate_datetime_to_hour(None)) + self.assertIsNone(truncate_datetime_to_hour("invalid-date")) diff --git a/core/tests_standardizer.py b/core/tests_standardizer.py deleted file mode 100644 index a50ff87..0000000 --- a/core/tests_standardizer.py +++ /dev/null @@ -1,201 +0,0 @@ -from django.test import TestCase - -from core.utils import standardizer - - -class StandardizerStandardizeCodeAndNameTest(TestCase): - - def test_standardize_code_and_name_returns_both(self): - expected = [{"code": "CE", 
"name": "Ceará"}] - text = "Ceará / CE" - result = standardizer.standardize_code_and_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertDictEqual(expected[i], item) - - def test_standardize_code_and_name_returns_acronym(self): - expected = [{"code": "CE", }] - text = "CE" - result = standardizer.standardize_code_and_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertDictEqual(expected[i], item) - - def test_standardize_code_and_name_returns_name(self): - expected = [{"name": "Ceará"}] - text = "Ceará" - result = standardizer.standardize_code_and_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertDictEqual(expected[i], item) - - def test_standardize_code_and_name_returns_more_than_one_both(self): - expected = [{"code": "CE", "name": "Ceará"}, - {"code": "SP", "name": "São Paulo"}] - text = "Ceará / CE, São Paulo / SP" - result = standardizer.standardize_code_and_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertDictEqual(expected[i], item) - - def test_standardize_code_and_name_returns_more_than_one_acronym(self): - expected = [{"code": "CE", }, {"code": "SP", }] - text = "CE / SP" - result = standardizer.standardize_code_and_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertDictEqual(expected[i], item) - - def test_standardize_code_and_name_returns_more_than_one_name(self): - expected = [{"name": "Ceará"}, {"name": "São Paulo"}] - text = "Ceará - São Paulo" - result = standardizer.standardize_code_and_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertDictEqual(expected[i], item) - - -class StandardizerStandardizeNameTest(TestCase): - - def test_standardize_name(self): - expected = ["Txto 1", "Texto 2", "Texto 3"] - text = "Txto 1, Texto 2, Texto 3" - result = standardizer.standardize_name(text) - for i, item in enumerate(result): - with self.subTest(i): - self.assertEqual({"name": expected[i]}, item) - - -class StandardizerStandardizeLanguageCode(TestCase): - def test_standardize_language_code_en_us_is_valid(self): - language_code = 'en-US' - standardized = standardizer.standardize_language_code(language_code) - self.assertEqual(standardized, 'en') - - def test_standardize_language_code_esp_is_valid(self): - language_code = 'esp' - standardized = standardizer.standardize_language_code(language_code) - self.assertEqual(standardized, 'es') - - def test_standardize_language_code_pt_br_is_valid(self): - language_code = 'pt-BR' - standardized = standardizer.standardize_language_code(language_code) - self.assertEqual(standardized, 'pt') - - def test_standardize_language_code_es_is_valid(self): - language_code = 'spa' - standardized = standardizer.standardize_language_code(language_code) - self.assertEqual(standardized, 'es') - - def test_standardize_language_code_en_gb_is_valid(self): - language_code = 'en-GB' - standardized = standardizer.standardize_language_code(language_code) - self.assertEqual(standardized, 'en') - - -class StandardizerStandardizePIDV3(TestCase): - def test_standardize_pid_v3_is_valid(self): - pid_v3 = 'jGJccQ7bFdbz6wy3nfXGVdv' - standardized = standardizer.standardize_pid_v3(pid_v3) - self.assertEqual(standardized, 'jGJccQ7bFdbz6wy3nfXGVdv') - - -class StandardizerStandardizePIDV2(TestCase): - def test_standardize_pid_v2_is_valid(self): - pid_v2 = 'S0102-67202020000100001' - standardized = standardizer.standardize_pid_v2(pid_v2) - self.assertEqual(standardized, 
'S0102-67202020000100001') - - -class StandardizerStandardizeDOI(TestCase): - def test_standardize_doi_is_valid(self): - doi = '10.1590/S0102-67202020000100001' - standardized = standardizer.standardize_doi(doi) - self.assertEqual(standardized, '10.1590/S0102-67202020000100001') - - def test_standardize_doi_is_valid_with_doi_prefix(self): - doi = 'doi:10.1590/S0102-67202020000100001' - standardized = standardizer.standardize_doi(doi) - self.assertEqual(standardized, '10.1590/S0102-67202020000100001') - - def test_standardize_doi_is_valid_with_http_prefix(self): - doi = 'http://doi.org/10.1590/S0102-67202020000100001' - standardized = standardizer.standardize_doi(doi) - self.assertEqual(standardized, '10.1590/S0102-67202020000100001') - - def test_standardize_doi_is_valid_with_https_prefix(self): - doi = 'https://doi.org/10.1590/S0102-67202020000100001' - standardized = standardizer.standardize_doi(doi) - self.assertEqual(standardized, '10.1590/S0102-67202020000100001') - - def test_standardize_doi_is_valid_with_doi_prefix_and_http_prefix(self): - doi = 'doi:http://doi.org/10.1590/S0102-67202020000100001' - standardized = standardizer.standardize_doi(doi) - self.assertEqual(standardized, '10.1590/S0102-67202020000100001') - - def test_standardize_doi_is_valid_with_doi_prefix_and_https_prefix(self): - doi = 'doi:https://doi.org/10.1590/S0102-67202020000100001' - standardized = standardizer.standardize_doi(doi) - self.assertEqual(standardized, '10.1590/S0102-67202020000100001') - - -class TestStandardizeYearOfPublication(TestCase): - def test_standardize_year_of_publication_four_digit_year(self): - """Test that a four-digit year is returned as-is""" - year = "2023" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, "2023") - - def test_standardize_year_of_publication_integer_year(self): - """Test that an integer year is converted to string""" - year = 2023 - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, "2023") - - def test_standardize_year_of_publication_year_range(self): - """Test that a year range returns the first year""" - year = "2020-2023" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, "2020") - - def test_standardize_year_of_publication_year_with_slash(self): - """Test that a year with slash returns the first year""" - year = "2020/2023" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, "2020") - - def test_standardize_year_of_publication_year_with_extra_text(self): - """Test that year with extra text extracts the year""" - year = "Published in 2023" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, "") - - def test_standardize_year_of_publication_invalid_year(self): - """Test that invalid year returns None or empty string""" - year = "invalid" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, '') - - def test_standardize_year_of_publication_empty_string(self): - """Test that empty string returns None or empty string""" - year = "" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, '') - - def test_standardize_year_of_publication_none_input(self): - """Test that None input returns None""" - year = None - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, '') - - def test_standardize_year_of_publication_two_digit_year(self): - """Test that two-digit year is converted to 
four-digit year""" - year = "23" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, '') - - def test_standardize_year_of_publication_year_with_parentheses(self): - """Test that year in parentheses is extracted""" - year = "(2023)" - result = standardizer.standardize_year_of_publication(year) - self.assertEqual(result, '') diff --git a/metrics/utils/file_utils.py b/core/utils/csv_utils.py similarity index 100% rename from metrics/utils/file_utils.py rename to core/utils/csv_utils.py diff --git a/core/utils/date_utils.py b/core/utils/date_utils.py index 026d434..f20ffea 100644 --- a/core/utils/date_utils.py +++ b/core/utils/date_utils.py @@ -29,7 +29,7 @@ def get_date_obj(date_str: str, format: str = "%Y-%m-%d") -> datetime.date: try: return datetime.strptime(date_str, format).date() except (ValueError, TypeError): - ... + return None def get_date_range_str(from_date_str: str = None, until_date_str: str = None, days_to_go_back: int = None) -> tuple[str, str]: @@ -99,12 +99,9 @@ def truncate_datetime_to_hour(dt): Returns: datetime: The truncated datetime object. """ - if isinstance(dt, str): - try: - dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") - except ValueError: - logging.error("Invalid datetime string format. Expected '%Y-%m-%d %H:%M:%S'.") - return None + dt = _coerce_datetime(dt) + if dt is None: + return None return dt.replace(minute=0, second=0, microsecond=0) @@ -119,11 +116,23 @@ def extract_minute_second_key(dt): Returns: str: A string in the format "MM:SS" representing the minute and second. """ + dt = _coerce_datetime(dt) + if dt is None: + return None + + return f"{dt.minute:02}:{dt.second:02}" + + +def _coerce_datetime(dt): + if isinstance(dt, datetime): + return dt + if isinstance(dt, str): try: - dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") + return datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") except ValueError: logging.error("Invalid datetime string format. Expected '%Y-%m-%d %H:%M:%S'.") return None - return f"{dt.minute:02}:{dt.second:02}" + logging.error("Invalid datetime value: %r.", dt) + return None diff --git a/core/utils/utils.py b/core/utils/request_utils.py similarity index 90% rename from core/utils/utils.py rename to core/utils/request_utils.py index 0397338..c4fbec6 100644 --- a/core/utils/utils.py +++ b/core/utils/request_utils.py @@ -1,8 +1,6 @@ import logging -import re import requests -from langcodes import standardize_tag, tag_is_valid from tenacity import ( retry, retry_if_exception_type, @@ -17,13 +15,6 @@ User = get_user_model() -def language_iso(code): - code = re.split(r"-|_", code)[0] if code else "" - if tag_is_valid(code): - return standardize_tag(code) - return "" - - class RetryableError(Exception): """Recoverable error without having to modify the data state on the client side, e.g. timeouts, errors from network partitioning, etc. 
@@ -92,4 +83,4 @@ def _get_user(request, username=None, user_id=None): if user_id: return User.objects.get(pk=user_id) if username: - return User.objects.get(username=username) \ No newline at end of file + return User.objects.get(username=username) diff --git a/core/utils/standardizer.py b/core/utils/standardizer.py index 27b5cba..c228bf5 100644 --- a/core/utils/standardizer.py +++ b/core/utils/standardizer.py @@ -1,247 +1,77 @@ -import langcodes import re - -ITEMS_SEP_FOR_LOCATION = [";", ", ", "|", "/"] -PARTS_SEP_FOR_LOCATION = [" - ", "- ", " -", ", ", "(", "/"] - -ITEMS_SEP_FOR_CITY = [",", "|"] -PARTS_SEP_FOR_CITY = [] - - -def remove_extra_spaces(text): - text = text and text.strip() - if not text: - return text - # padroniza a quantidade de espaços - return " ".join([item.strip() for item in text.split() if item.strip()]) - - -def standardize_code_and_name(original): - """ - Dado o texto original, identifica pares de code e nome. - Os separadores podem separar code e nome e/ou itens de lista. - Ex.: USP / Unicamp - São Paulo/SP, Rio de Janeiro/RJ - """ - text_ = original - text_ = text_ and text_.strip() - if not text_: - return [] - - text_ = remove_extra_spaces(text_) - if not text_: - yield {"name": None} - return - - items_separators = ITEMS_SEP_FOR_LOCATION - parts_separators = PARTS_SEP_FOR_LOCATION - - PARTBR = "~PARTBR~" - LINEBR = "~LINEBR~" - for sep in items_separators: - text_ = text_.replace(sep, PARTBR) - for sep in parts_separators: - text_ = text_.replace(sep, PARTBR) - - codes = [] - names = [] - for item in text_.split(PARTBR): - item = item.strip() - if not item: - continue - if len(item) == 2: - codes.append(item) - else: - names.append(item) - - if len(names) == len(codes): - for acron, name in zip(codes, names): - yield {"code": acron, "name": name} - elif len(names) == 0: - for acron in codes: - yield {"code": acron} - elif len(codes) == 0: - for name in names: - yield {"name": name} - else: - # como o texto está bem fora do padrão, - # pode-se evidenciar retornando o original - yield {"name": original} - - -def standardize_name(original): - original = original and original.strip() - if not original: - return - - items_separators = ITEMS_SEP_FOR_CITY - - LINEBR = "~LINEBR~" - - text_ = original - text_ = remove_extra_spaces(text_) - - for sep in items_separators: - text_ = text_.replace(sep, LINEBR) - - for row in text_.split(LINEBR): - row = row and row.strip() - if not row: - continue - yield {"name": row} +import langcodes def standardize_language_code(language_code: str, threshold=0.75): - """ - Standardizes a media language using langcodes library. - - Parameters: - media_language (str): The media language to be standardized. - threshold (float): The minimum score for a language to be considered valid. Default is 0.75. - - Returns: - str: The standardized media language or None if the input is not a valid language tag. 
- """ - if not language_code: - return 'un' - - if langcodes.tag_is_valid(language_code): - return langcodes.standardize_tag(language_code).split('-')[0] - - # Handle special cases - if language_code.lower() == 'esp': - return 'es' - - inferred_lang, score = langcodes.best_match(language_code, langcodes.LANGUAGE_ALPHA3.keys()) - - if score >= threshold: - return langcodes.standardize_tag(inferred_lang).split('-')[0] - - # Handle unknown languages - return 'un' + language_code = str(language_code).strip().strip("'\"") + lang = langcodes.get(language_code) + try: + parts = str(lang).split("-") + except Exception: + return "un" + return parts[0] def standardize_pid_v2(pid_v2): - """ - Standardizes a PID v2. - - Parameters: - pid_v2 (str): The PID v2 to be standardized. - - Returns: - str: The standardized PID v2 or an empty string if the input is not a valid PID v2. - """ - if not pid_v2 or not pid_v2.lower().startswith('s') or len(pid_v2) < 23: - return '' - + if not pid_v2 or not pid_v2.lower().startswith("s") or len(pid_v2) < 23: + return "" + if len(pid_v2) == 23: return pid_v2[0].upper() + pid_v2[1:] - + if len(pid_v2) > 23: return pid_v2[0].upper() + pid_v2[1:23] - - if len(pid_v2) < 23: - return '' + return "" -def standardize_pid_v3(pid_v3): - """ - Standardizes a PID v3 using langcodes library." - - Parameters: - pid_v3 (str): The PID v3 to be standardized. - Returns: - str: The standardized PID v3 or an empty string if the input is not a valid PID v3. - """ - - if not pid_v3: - return '' - - if len(pid_v3) == 23: - return pid_v3 - - if len(pid_v3) > 23: - return pid_v3[:23] - - if len(pid_v3) < 23: - return '' +def standardize_pid_v3(pid_v3): + return str(pid_v3 or "") def standardize_doi(text): - """" - Standardizes a DOI. - - Parameters: - text (str): The DOI to be standardized. - - Returns: - str: The standardized DOI - """ - PATTERNS_DOI = [re.compile(pd) for pd in [ - r'10.\d{4,9}/[-._;()/:A-Z0-9]+$', - r'10.1002/[^\s]+$', - r'10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$', - r'10.1207/[\w\d]+\&\d+_\d+$', - r'10.\d{4,9}/[-._;()/:a-zA-Z0-9]*'] + text = (text or "").strip() + if not text: + return "" + + doi_prefixes = [ + "https://doi.org/", + "http://doi.org/", + "https://dx.doi.org/", + "http://dx.doi.org/", + "doi.org/", + "dx.doi.org/", + "doi:", ] - matched_doi = False - - for pattern_doi in PATTERNS_DOI: - matched_doi = pattern_doi.search(text) - if matched_doi: + for prefix in doi_prefixes: + if text.lower().startswith(prefix): + text = text[len(prefix):] break - if not matched_doi: - return - - return matched_doi.group().upper() + if text.lower().startswith("10."): + return text + + return "" def standardize_pid_generic(pid_generic): - """ - Standardizes a PID." - - Parameters: - pid_generic (str): The PID to be standardized. - - Returns: - str: The standardized PID or an empty string if the input is not a valid PID. - """ - - if not pid_generic: - return '' - - pid_generic_based_on_doi = standardize_doi(pid_generic) - if pid_generic_based_on_doi: - return pid_generic_based_on_doi - - return pid_generic.strip().upper() + value = str(pid_generic or "").strip().upper() + value = re.sub(r"\s+", "", value) + value = value.rstrip(".,;:") + return value or "" def standardize_year_of_publication(year_of_publication): - """ - Standardizes a year of publication. - - Parameters: - year_of_publication (str): The year of publication to be standardized. - - Returns: - str: The standardized year of publication or an empty string if the input is not a valid year. 
- """ - if not year_of_publication: - return '' - - # Truncate to 4 characters if longer - if isinstance(year_of_publication, str) and len(year_of_publication) > 4: - year_of_publication = year_of_publication[:4] - - try: - year = int(year_of_publication) - if 1500 <= year <= 2100: - return str(year) - except ValueError: - pass - - return '' + value = str(year_of_publication or "").strip() + if not value: + return "" + match = re.match(r"(\d{4})", value) + return match.group(1) if match else "" + + +def language_iso(code): + code = re.split(r"-|_", code)[0] if code else "" + if langcodes.tag_is_valid(code): + return langcodes.standardize_tag(code) + return "" diff --git a/django_celery_beat/views.py b/django_celery_beat/views.py index 3a4ddb0..b5cff84 100644 --- a/django_celery_beat/views.py +++ b/django_celery_beat/views.py @@ -21,6 +21,13 @@ def task_run(request): task = current_app.tasks.get(p_task.task) + if task is None: + messages.error( + request, + _("Task '{0}' not found in the Celery registry.").format(p_task.task), + ) + return redirect(request.META.get("HTTP_REFERER")) + kwargs = json.loads(p_task.kwargs) kwargs["user_id"] = request.user.id diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 6957700..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = ./_build -APP = /app - -.PHONY: help livehtml apidocs Makefile - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c . - -# Build, watch and serve docs with live reload -livehtml: - sphinx-autobuild -b html --host 0.0.0.0 --port 9000 --watch $(APP) -c . $(SOURCEDIR) $(BUILDDIR)/html - -# Outputs rst files from django application code -apidocs: - sphinx-apidoc -o $(SOURCEDIR)/api $(APP) - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c . diff --git a/docs/__init__.py b/docs/__init__.py deleted file mode 100644 index 8772c82..0000000 --- a/docs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Included so that Django's startproject comment runs against the docs directory diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 51cd921..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,64 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
- -import os -import sys - -import django - -if os.getenv("READTHEDOCS", default=False) == "True": - sys.path.insert(0, os.path.abspath("..")) - os.environ["DJANGO_READ_DOT_ENV_FILE"] = "True" - os.environ["USE_DOCKER"] = "no" -else: - sys.path.insert(0, os.path.abspath("/app")) -os.environ["DATABASE_URL"] = "sqlite:///readthedocs.db" -os.environ["CELERY_BROKER_URL"] = os.getenv("REDIS_URL", "redis://redis:6379") -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.local") -django.setup() - -# -- Project information ----------------------------------------------------- - -project = "SciELO Core" -copyright = """2022, SciELO""" -author = "SciELO" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.napoleon", -] - -# Add any paths that contain templates here, relative to this directory. -# templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "alabaster" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ["_static"] diff --git a/docs/howto.rst b/docs/howto.rst deleted file mode 100644 index 9fae300..0000000 --- a/docs/howto.rst +++ /dev/null @@ -1,38 +0,0 @@ -How To - Project Documentation -====================================================================== - -Get Started ----------------------------------------------------------------------- - -Documentation can be written as rst files in `core/docs`. - - -To build and serve docs, use the commands:: - - docker compose -f local.yml up docs - - - -Changes to files in `docs/_source` will be picked up and reloaded automatically. - -`Sphinx `_ is the tool used to build documentation. - -Docstrings to Documentation ----------------------------------------------------------------------- - -The sphinx extension `apidoc `_ is used to automatically document code using signatures and docstrings. - -Numpy or Google style docstrings will be picked up from project files and availble for documentation. See the `Napoleon `_ extension for details. - -For an in-use example, see the `page source <_sources/users.rst.txt>`_ for :ref:`users`. - -To compile all docstrings automatically into documentation source files, use the command: - :: - - make apidocs - - -This can be done in the docker container: - :: - - docker run --rm docs make apidocs diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index b6c6ded..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. SciELO Content Manager documentation master file, created by - sphinx-quickstart. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to SciELO Core's documentation! 
-====================================================================== - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - howto - users - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 4f70eed..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,46 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -c . -) -set SOURCEDIR=_source -set BUILDDIR=_build -set APP=..\core - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.Install sphinx-autobuild for live serving. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -b %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:livehtml -sphinx-autobuild -b html --open-browser -p 9000 --watch %APP% -c . %SOURCEDIR% %BUILDDIR%/html -GOTO :EOF - -:apidocs -sphinx-apidoc -o %SOURCEDIR%/api %APP% -GOTO :EOF - -:help -%SPHINXBUILD% -b help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/users.rst b/docs/users.rst deleted file mode 100644 index 21e08aa..0000000 --- a/docs/users.rst +++ /dev/null @@ -1,15 +0,0 @@ - .. _users: - -Users -====================================================================== - -Starting a new project, it’s highly recommended to set up a custom user model, -even if the default User model is sufficient for you. - -This model behaves identically to the default user model, -but you’ll be able to customize it in the future if the need arises. - -.. 
automodule:: core.users.models - :members: - :noindex: - diff --git a/document/__init__.py b/document/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/__init__.py @@ -0,0 +1 @@ + diff --git a/journal/apps.py b/document/apps.py similarity index 62% rename from journal/apps.py rename to document/apps.py index e10a171..eb482d2 100644 --- a/journal/apps.py +++ b/document/apps.py @@ -1,6 +1,6 @@ from django.apps import AppConfig -class JournalConfig(AppConfig): +class DocumentConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" - name = "journal" + name = "document" diff --git a/document/management/__init__.py b/document/management/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/management/__init__.py @@ -0,0 +1 @@ + diff --git a/document/management/commands/__init__.py b/document/management/commands/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/management/commands/__init__.py @@ -0,0 +1 @@ + diff --git a/document/management/commands/load_articles_by_year.py b/document/management/commands/load_articles_by_year.py new file mode 100644 index 0000000..a922456 --- /dev/null +++ b/document/management/commands/load_articles_by_year.py @@ -0,0 +1,80 @@ +from django.core.management.base import BaseCommand + +from document.tasks import task_load_documents_from_article_meta +from document.tasks import task_load_documents_from_opac + + +class Command(BaseCommand): + help = "Generate task requests for loading document data by year" + + def add_arguments(self, parser): + parser.add_argument( + "--start-year", + type=int, + default=1990, + help="Start year (default: 1990)", + ) + parser.add_argument( + "--end-year", + type=int, + default=2025, + help="End year (default: 2025)", + ) + parser.add_argument( + "--collection", + type=str, + default="scl", + help="Collection code (default: scl)", + ) + parser.add_argument( + "--task", + choices=["load_documents_from_opac", "load_documents_from_article_meta"], + default="load_documents_from_opac", + help="Task to execute (default: load_documents_from_opac)", + ) + + def handle(self, *args, **options): + start_year = options["start_year"] + end_year = options["end_year"] + collection = options["collection"] + + self.stdout.write( + self.style.SUCCESS( + f"Generating task requests from {start_year} to {end_year} for collection: {collection}" + ) + ) + + total_tasks = 0 + + for year in range(start_year, end_year + 1): + from_date = f"{year}-01-01" + until_date = f"{year}-12-31" + + self.stdout.write(f"Queuing task for year {year}...") + + if options["task"] == "load_documents_from_article_meta": + task_result = task_load_documents_from_article_meta.delay( + from_date=from_date, + until_date=until_date, + collection=collection, + ) + else: + task_result = task_load_documents_from_opac.delay( + from_date=from_date, + until_date=until_date, + collection=collection, + ) + + total_tasks += 1 + + self.stdout.write( + self.style.SUCCESS( + f"✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})" + ) + ) + + self.stdout.write( + self.style.SUCCESS( + f"\nCompleted! {total_tasks} tasks have been queued successfully." 
+ ) + ) diff --git a/document/migrations/0001_initial.py b/document/migrations/0001_initial.py new file mode 100644 index 0000000..bff11be --- /dev/null +++ b/document/migrations/0001_initial.py @@ -0,0 +1,279 @@ +# Generated by Django 5.0.7 on 2026-03-15 00:00 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("collection", "0001_initial"), + ("source", "0001_initial"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="Document", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + verbose_name="Creation date", + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, + verbose_name="Last update date", + ), + ), + ( + "document_type", + models.CharField( + choices=[ + ("article", "Article"), + ("preprint", "Preprint"), + ("dataset", "Dataset"), + ("book", "Book"), + ("chapter", "Chapter"), + ("other", "Other"), + ], + db_index=True, + max_length=32, + verbose_name="Document Type", + ), + ), + ( + "document_id", + models.CharField( + db_index=True, + max_length=255, + verbose_name="Document ID", + ), + ), + ( + "scielo_issn", + models.CharField( + blank=True, + db_index=True, + max_length=9, + null=True, + verbose_name="SciELO ISSN", + ), + ), + ( + "pid_v2", + models.CharField( + blank=True, + db_index=True, + max_length=23, + null=True, + verbose_name="PID V2", + ), + ), + ( + "pid_v3", + models.CharField( + blank=True, + db_index=True, + max_length=23, + null=True, + verbose_name="PID V3", + ), + ), + ( + "pid_generic", + models.CharField( + blank=True, + db_index=True, + max_length=255, + null=True, + verbose_name="PID Generic", + ), + ), + ( + "title", + models.CharField( + blank=True, + max_length=500, + null=True, + verbose_name="Document Title", + ), + ), + ( + "identifiers", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Identifiers", + ), + ), + ( + "files", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Files", + ), + ), + ( + "default_lang", + models.CharField( + blank=True, + max_length=8, + null=True, + verbose_name="Default Language", + ), + ), + ( + "text_langs", + models.JSONField( + blank=True, + default=list, + null=True, + verbose_name="Text Languages", + ), + ), + ( + "default_media_format", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Default Media Format", + ), + ), + ( + "processing_date", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Processing Date", + ), + ), + ( + "publication_date", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Publication Date", + ), + ), + ( + "publication_year", + models.CharField( + blank=True, + db_index=True, + max_length=4, + null=True, + verbose_name="Publication Year", + ), + ), + ( + "extra_data", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Extra Data", + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + 
to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "parent_document", + models.ForeignKey( + blank=True, + db_index=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="child_documents", + to="document.document", + verbose_name="Parent Document", + ), + ), + ( + "source", + models.ForeignKey( + blank=True, + db_index=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="documents", + to="source.source", + verbose_name="Source", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + options={ + "verbose_name": "Document", + "verbose_name_plural": "Documents", + "unique_together": {("collection", "document_type", "document_id")}, + "indexes": [ + models.Index( + fields=["collection", "document_type"], + name="document_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + name="document_collection_issn_idx", + ), + models.Index( + fields=["collection", "pid_v2"], + name="document_collection_pidv2_idx", + ), + models.Index( + fields=["collection", "pid_generic"], + name="doc_coll_pidgen_idx", + ), + ], + }, + ), + ] diff --git a/document/migrations/__init__.py b/document/migrations/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/migrations/__init__.py @@ -0,0 +1 @@ + diff --git a/document/models.py b/document/models.py new file mode 100644 index 0000000..5197692 --- /dev/null +++ b/document/models.py @@ -0,0 +1,258 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection +from core.models import CommonControlField +from source.models import Source + + +class Document(CommonControlField): + DOCUMENT_TYPE_ARTICLE = "article" + DOCUMENT_TYPE_PREPRINT = "preprint" + DOCUMENT_TYPE_DATASET = "dataset" + DOCUMENT_TYPE_BOOK = "book" + DOCUMENT_TYPE_CHAPTER = "chapter" + DOCUMENT_TYPE_OTHER = "other" + DOCUMENT_TYPE_CHOICES = ( + (DOCUMENT_TYPE_ARTICLE, _("Article")), + (DOCUMENT_TYPE_PREPRINT, _("Preprint")), + (DOCUMENT_TYPE_DATASET, _("Dataset")), + (DOCUMENT_TYPE_BOOK, _("Book")), + (DOCUMENT_TYPE_CHAPTER, _("Chapter")), + (DOCUMENT_TYPE_OTHER, _("Other")), + ) + + collection = models.ForeignKey( + Collection, + verbose_name=_("Collection"), + on_delete=models.CASCADE, + blank=False, + null=False, + db_index=True, + ) + + source = models.ForeignKey( + Source, + verbose_name=_("Source"), + on_delete=models.CASCADE, + related_name="documents", + blank=True, + null=True, + db_index=True, + ) + + parent_document = models.ForeignKey( + "self", + verbose_name=_("Parent Document"), + on_delete=models.SET_NULL, + related_name="child_documents", + blank=True, + null=True, + db_index=True, + ) + + document_type = models.CharField( + verbose_name=_("Document Type"), + max_length=32, + choices=DOCUMENT_TYPE_CHOICES, + blank=False, + null=False, + db_index=True, + ) + + document_id = models.CharField( + verbose_name=_("Document ID"), + max_length=255, + blank=False, + null=False, + db_index=True, + ) + + scielo_issn = models.CharField( + verbose_name=_("SciELO ISSN"), + max_length=9, + blank=True, + null=True, + db_index=True, + ) + + pid_v2 = models.CharField( + verbose_name=_("PID V2"), + max_length=23, + blank=True, + null=True, + db_index=True, + ) + + pid_v3 = models.CharField( + 
verbose_name=_("PID V3"), + max_length=23, + blank=True, + null=True, + db_index=True, + ) + + pid_generic = models.CharField( + verbose_name=_("PID Generic"), + max_length=255, + blank=True, + null=True, + db_index=True, + ) + + title = models.CharField( + verbose_name=_("Document Title"), + max_length=500, + blank=True, + null=True, + ) + + identifiers = models.JSONField( + verbose_name=_("Identifiers"), + null=True, + blank=True, + default=dict, + ) + + files = models.JSONField( + verbose_name=_("Files"), + null=True, + blank=True, + default=dict, + ) + + default_lang = models.CharField( + verbose_name=_("Default Language"), + max_length=8, + blank=True, + null=True, + ) + + text_langs = models.JSONField( + verbose_name=_("Text Languages"), + null=True, + blank=True, + default=list, + ) + + default_media_format = models.CharField( + verbose_name=_("Default Media Format"), + max_length=32, + blank=True, + null=True, + ) + + processing_date = models.CharField( + verbose_name=_("Processing Date"), + max_length=32, + blank=True, + null=True, + ) + + publication_date = models.CharField( + verbose_name=_("Publication Date"), + max_length=32, + blank=True, + null=True, + ) + + publication_year = models.CharField( + verbose_name=_("Publication Year"), + max_length=4, + blank=True, + null=True, + db_index=True, + ) + + extra_data = models.JSONField( + verbose_name=_("Extra Data"), + null=True, + blank=True, + default=dict, + ) + + def __str__(self): + return f"{self.collection.acron3} - {self.document_type} - {self.document_id}" + + @classmethod + def metadata(cls, collection=None): + queryset = cls.objects.select_related("collection", "source").only( + "collection__acron3", + "default_lang", + "default_media_format", + "document_id", + "document_type", + "extra_data", + "files", + "identifiers", + "parent_document__document_id", + "pid_generic", + "pid_v2", + "pid_v3", + "processing_date", + "publication_date", + "publication_year", + "scielo_issn", + "source__scielo_issn", + "source__source_id", + "source__source_type", + "text_langs", + "title", + ) + + if collection: + queryset = queryset.filter(collection=collection) + + for document in queryset.iterator(): + source = document.source + yield { + "collection": document.collection.acron3, + "default_lang": document.default_lang, + "default_media_format": document.default_media_format, + "document_id": document.document_id, + "document_type": document.document_type, + "extra_data": document.extra_data or {}, + "files": document.files or {}, + "identifiers": document.identifiers or {}, + "parent_document_id": ( + document.parent_document.document_id if document.parent_document else None + ), + "pid_generic": document.pid_generic, + "pid_v2": document.pid_v2, + "pid_v3": document.pid_v3, + "processing_date": document.processing_date, + "publication_date": document.publication_date, + "publication_year": document.publication_year, + "scielo_issn": document.scielo_issn or (source.scielo_issn if source else None), + "source_id": source.source_id if source else None, + "source_type": source.source_type if source else None, + "text_langs": document.text_langs or [], + "title": document.title, + } + + class Meta: + verbose_name = _("Document") + verbose_name_plural = _("Documents") + unique_together = ( + "collection", + "document_type", + "document_id", + ) + indexes = [ + models.Index( + fields=["collection", "document_type"], + name="document_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + 
name="document_collection_issn_idx", + ), + models.Index( + fields=["collection", "pid_v2"], + name="document_collection_pidv2_idx", + ), + models.Index( + fields=["collection", "pid_generic"], + name="doc_coll_pidgen_idx", + ), + ] diff --git a/document/services/__init__.py b/document/services/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/document/services/__init__.py @@ -0,0 +1 @@ + diff --git a/document/services/articles.py b/document/services/articles.py new file mode 100644 index 0000000..09244b3 --- /dev/null +++ b/document/services/articles.py @@ -0,0 +1,166 @@ +from document.models import Document + +from .common import build_document_id, compact_dict, get_existing_document, normalize_langs, normalize_year + + +def upsert_article_document_from_articlemeta( + payload, + collection, + source=None, + user=None, + force_update=True, +): + pid_v2 = payload.get("code") + document_id = build_document_id(pid_v2, payload.get("pid_v3"), payload.get("pid_generic")) + if not document_id: + return None + + document = get_existing_document( + collection, + Document.DOCUMENT_TYPE_ARTICLE, + document_id, + pid_v2, + ) + created = document is None + if created: + document = Document( + collection=collection, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id=document_id, + ) + if user: + document.creator = user + + if created or force_update: + document.source = source + document.parent_document = None + document.scielo_issn = source.scielo_issn if source else None + document.pid_v2 = pid_v2 or document.pid_v2 + document.pid_v3 = payload.get("pid_v3") or document.pid_v3 + document.pid_generic = payload.get("pid_generic") or document.pid_generic + document.title = payload.get("title") or document.title + document.identifiers = _merge_dicts( + document.identifiers, + _build_articlemeta_identifiers(payload, source), + ) + document.files = payload.get("pdfs") or document.files or {} + document.default_lang = payload.get("default_language") or document.default_lang + document.text_langs = normalize_langs(payload.get("text_langs")) + document.default_media_format = document.default_media_format + document.processing_date = payload.get("processing_date") or document.processing_date + document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_year = normalize_year( + payload.get("publication_year"), + fallback_date=document.publication_date, + ) + document.extra_data = _merge_dicts( + document.extra_data, + compact_dict( + { + "provider": "articlemeta", + "issn_codes": payload.get("code_title"), + } + ), + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def upsert_article_document_from_opac( + payload, + collection, + source=None, + user=None, + force_update=True, +): + pid_v2 = payload.get("pid_v2") + pid_v3 = payload.get("pid_v3") + document_id = build_document_id(pid_v2, pid_v3, payload.get("pid_generic")) + if not document_id: + return None + + document = get_existing_document( + collection, + Document.DOCUMENT_TYPE_ARTICLE, + document_id, + pid_v2, + pid_v3, + payload.get("pid_generic"), + ) + created = document is None + if created: + document = Document( + collection=collection, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id=document_id, + ) + if user: + document.creator = user + + if created or force_update: + document.source = source + document.parent_document = None + document.scielo_issn = source.scielo_issn if source else None + document.pid_v2 = 
pid_v2 or document.pid_v2 + document.pid_v3 = pid_v3 or document.pid_v3 + document.pid_generic = payload.get("pid_generic") or document.pid_generic + document.title = payload.get("title") or document.title + document.identifiers = _merge_dicts( + document.identifiers, + _build_opac_identifiers(payload, source), + ) + document.files = document.files or {} + document.default_lang = payload.get("default_language") or document.default_lang + document.text_langs = normalize_langs(payload.get("text_langs")) or document.text_langs or [] + document.default_media_format = document.default_media_format + document.processing_date = document.processing_date + document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_year = normalize_year( + payload.get("publication_year"), + fallback_date=document.publication_date, + ) + document.extra_data = _merge_dicts( + document.extra_data, + compact_dict( + { + "provider": "opac", + "journal_acronym": payload.get("journal_acronym"), + } + ), + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def _build_articlemeta_identifiers(payload, source): + return compact_dict( + { + "pid_v2": payload.get("code"), + "scielo_issn": source.scielo_issn if source else None, + } + ) + + +def _build_opac_identifiers(payload, source): + return compact_dict( + { + "pid_v2": payload.get("pid_v2"), + "pid_v3": payload.get("pid_v3"), + "scielo_issn": source.scielo_issn if source else None, + "journal_acronym": payload.get("journal_acronym"), + } + ) + + +def _merge_dicts(current, new_values): + merged = dict(current or {}) + merged.update(new_values or {}) + return merged diff --git a/document/services/books.py b/document/services/books.py new file mode 100644 index 0000000..96d92e1 --- /dev/null +++ b/document/services/books.py @@ -0,0 +1,256 @@ +from document.models import Document + + +def build_book_pid_generic(book_id): + if book_id in (None, ""): + return None + return f"book:{book_id}" + + +def build_chapter_pid_generic(book_id, chapter_id): + if book_id in (None, "") or chapter_id in (None, ""): + return None + return f"book:{book_id}/chapter:{chapter_id}" + + +def enrich_part_payload(payload, monograph_payload): + if not monograph_payload: + return payload + + enriched = dict(payload) + enriched["monograph_title"] = monograph_payload.get("title") + enriched["monograph_language"] = monograph_payload.get("language") + enriched["monograph_publication_date"] = monograph_payload.get("publication_date") + enriched["monograph_year"] = monograph_payload.get("year") + enriched["monograph_publisher"] = monograph_payload.get("publisher") + enriched["monograph_isbn"] = monograph_payload.get("isbn") + enriched["monograph_eisbn"] = monograph_payload.get("eisbn") + enriched["monograph_doi_number"] = monograph_payload.get("doi_number") + enriched["monograph_creators"] = monograph_payload.get("creators") + return enriched + + +def upsert_monograph_document( + payload, + collection, + source=None, + user=None, + force_update=True, + source_url=None, + last_seq=None, +): + if payload.get("TYPE") != "Monograph": + return None + + book_id = str(payload.get("id")) + pid_generic = build_book_pid_generic(book_id) + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id=pid_generic, + ) + + if created and user: + document.creator = user + + if created or force_update: + document.source = source + document.parent_document = None + 
document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = pid_generic + document.title = payload.get("title") or book_id + document.identifiers = _build_monograph_identifiers(payload) + document.files = {} + document.default_lang = payload.get("language") or None + document.text_langs = _unique_list(payload.get("language")) + document.default_media_format = None + document.processing_date = None + document.publication_date = payload.get("publication_date") or None + document.publication_year = _normalize_year(payload.get("year")) + document.extra_data = _build_monograph_extra_data( + payload, + source_url=source_url, + last_seq=last_seq, + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def upsert_part_document( + payload, + collection, + source=None, + parent_document=None, + user=None, + force_update=True, + source_url=None, + last_seq=None, +): + if payload.get("TYPE") != "Part": + return None + + book_id = payload.get("monograph") + chapter_id = payload.get("id") + pid_generic = build_chapter_pid_generic(book_id, chapter_id) + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_CHAPTER, + document_id=pid_generic, + ) + + if created and user: + document.creator = user + + if created or force_update: + document.source = source + document.parent_document = parent_document + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = pid_generic + document.title = payload.get("title") or str(chapter_id) + document.identifiers = _build_part_identifiers(payload) + document.files = {} + document.default_lang = ( + payload.get("text_language") + or payload.get("monograph_language") + or None + ) + document.text_langs = _unique_list( + payload.get("text_language") or payload.get("monograph_language") + ) + document.default_media_format = None + document.processing_date = None + document.publication_date = payload.get("monograph_publication_date") or None + document.publication_year = _normalize_year(payload.get("monograph_year")) + document.extra_data = _build_part_extra_data( + payload, + source_url=source_url, + last_seq=last_seq, + ) + + if user: + document.updated_by = user + + document.save() + return document + + +def delete_book_document(collection, book_id): + return Document.objects.filter( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id=build_book_pid_generic(book_id), + ).delete() + + +def delete_document_by_raw_id(collection, raw_id): + return Document.objects.filter( + collection=collection, + extra_data__raw_id=str(raw_id), + ).delete() + + +def has_monograph_document_for_raw_id(collection, raw_id): + return Document.objects.filter( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + extra_data__raw_id=str(raw_id), + ).exists() + + +def get_monograph_document(collection, book_id): + return Document.objects.filter( + collection=collection, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id=build_book_pid_generic(book_id), + ).first() + + +def _build_monograph_identifiers(payload): + identifiers = { + "book_id": str(payload.get("id")) if payload.get("id") is not None else None, + "isbn": payload.get("isbn"), + "eisbn": payload.get("eisbn"), + "doi": payload.get("doi_number"), + } + return _compact_dict(identifiers) + + +def _build_part_identifiers(payload): + identifiers = { + "book_id": str(payload.get("monograph")) if 
payload.get("monograph") is not None else None, + "chapter_id": str(payload.get("id")) if payload.get("id") is not None else None, + "isbn": payload.get("monograph_isbn"), + "eisbn": payload.get("monograph_eisbn"), + "doi": payload.get("doi_number"), + "book_doi": payload.get("monograph_doi_number"), + } + return _compact_dict(identifiers) + + +def _build_monograph_extra_data(payload, source_url=None, last_seq=None): + extra_data = { + "raw_id": str(payload.get("id")) if payload.get("id") is not None else None, + "raw_type": payload.get("TYPE"), + "source_url": source_url, + "last_seq": last_seq, + "visible": payload.get("visible"), + "city": payload.get("city"), + "country": payload.get("country"), + "pages": payload.get("pages"), + "publisher": payload.get("publisher"), + "creators": payload.get("creators"), + "translated_titles": payload.get("translated_titles"), + "translated_synopses": payload.get("translated_synopses"), + "synopsis": payload.get("synopsis"), + } + return _compact_dict(extra_data) + + +def _build_part_extra_data(payload, source_url=None, last_seq=None): + extra_data = { + "raw_id": str(payload.get("id")) if payload.get("id") is not None else None, + "raw_type": payload.get("TYPE"), + "source_url": source_url, + "last_seq": last_seq, + "visible": payload.get("visible"), + "order": payload.get("order"), + "pages": payload.get("pages"), + "creators": payload.get("creators"), + "translated_titles": payload.get("translated_titles"), + "monograph_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None, + "monograph_title": payload.get("monograph_title"), + "monograph_language": payload.get("monograph_language"), + "monograph_publication_date": payload.get("monograph_publication_date"), + "monograph_year": payload.get("monograph_year"), + "monograph_publisher": payload.get("monograph_publisher"), + "monograph_creators": payload.get("monograph_creators"), + } + return _compact_dict(extra_data) + + +def _unique_list(value): + if not value: + return [] + return [value] + + +def _normalize_year(value): + if value in (None, ""): + return None + return str(value)[:4] + + +def _compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, ()) + } diff --git a/document/services/common.py b/document/services/common.py new file mode 100644 index 0000000..91e103d --- /dev/null +++ b/document/services/common.py @@ -0,0 +1,58 @@ +from document.models import Document + + +def build_document_id(*values): + for value in values: + if value not in (None, ""): + return str(value) + return None + + +def get_existing_document(collection, document_type, *identifiers): + identifiers = [str(value) for value in identifiers if value not in (None, "")] + if not identifiers: + return None + + queryset = Document.objects.filter( + collection=collection, + document_type=document_type, + ) + + for field_name in ("document_id", "pid_v2", "pid_v3", "pid_generic"): + for identifier in identifiers: + document = queryset.filter(**{field_name: identifier}).first() + if document: + return document + + return None + + +def normalize_langs(value): + if not value: + return [] + + if isinstance(value, list): + return [item for item in value if item not in (None, "")] + + if isinstance(value, dict): + return [key for key, enabled in value.items() if enabled] + + return [value] + + +def normalize_year(value, fallback_date=None): + if value not in (None, ""): + return str(value)[:4] + + if fallback_date not in (None, ""): + return 
str(fallback_date)[:4] + + return None + + +def compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, ()) + } diff --git a/document/services/datasets.py b/document/services/datasets.py new file mode 100644 index 0000000..2496b20 --- /dev/null +++ b/document/services/datasets.py @@ -0,0 +1,69 @@ +from document.models import Document + +from .common import compact_dict, normalize_year + + +def upsert_dataset_document( + payload, + collection, + user=None, + force_update=True, +): + dataset_doi = payload.get("dataset_doi") + if not dataset_doi: + return None + + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_DATASET, + document_id=dataset_doi, + ) + + if created and user: + document.creator = user + + if created or force_update: + files = dict(document.files or {}) + file_id = payload.get("file_id") + if file_id: + files[str(file_id)] = compact_dict( + { + "name": payload.get("file_name"), + "url": payload.get("file_url"), + "file_persistent_id": payload.get("file_persistent_id"), + } + ) + + document.source = None + document.parent_document = None + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = dataset_doi + document.title = payload.get("title") or document.title + document.identifiers = compact_dict( + { + "dataset_doi": dataset_doi, + } + ) + document.files = files + document.default_lang = document.default_lang + document.text_langs = document.text_langs or [] + document.default_media_format = document.default_media_format + document.processing_date = document.processing_date + document.publication_date = payload.get("dataset_published") or document.publication_date + document.publication_year = normalize_year( + None, + fallback_date=document.publication_date, + ) + document.extra_data = compact_dict( + { + "provider": "dataverse", + } + ) + + if user: + document.updated_by = user + + document.save() + return document diff --git a/document/services/preprints.py b/document/services/preprints.py new file mode 100644 index 0000000..4be89f1 --- /dev/null +++ b/document/services/preprints.py @@ -0,0 +1,58 @@ +from document.models import Document + +from .common import compact_dict, normalize_langs, normalize_year + + +def upsert_preprint_document( + payload, + collection, + user=None, + force_update=True, +): + pid_generic = payload.get("pid_generic") + if not pid_generic: + return None + + document, created = Document.objects.get_or_create( + collection=collection, + document_type=Document.DOCUMENT_TYPE_PREPRINT, + document_id=pid_generic, + ) + + if created and user: + document.creator = user + + if created or force_update: + document.source = None + document.parent_document = None + document.scielo_issn = None + document.pid_v2 = None + document.pid_v3 = None + document.pid_generic = pid_generic + document.title = payload.get("title") or document.title + document.identifiers = compact_dict( + { + "pid_generic": pid_generic, + } + ) + document.files = document.files or {} + document.default_lang = payload.get("default_language") or document.default_lang + document.text_langs = normalize_langs(payload.get("text_langs")) + document.default_media_format = document.default_media_format + document.processing_date = document.processing_date + document.publication_date = payload.get("publication_date") or document.publication_date + document.publication_year = normalize_year( + payload.get("publication_year"), + 
fallback_date=document.publication_date, + ) + document.extra_data = compact_dict( + { + "provider": "preprints", + } + ) + + if user: + document.updated_by = user + + document.save() + return document diff --git a/document/tasks/__init__.py b/document/tasks/__init__.py new file mode 100644 index 0000000..95a0ba5 --- /dev/null +++ b/document/tasks/__init__.py @@ -0,0 +1,28 @@ +from .articlemeta import ( + load_documents_from_article_meta, + task_load_documents_from_article_meta, +) +from .common import ( + get_latest_scielo_books_last_seq, +) +from .dataverse import ( + load_dataset_metadata_from_dataverse, + task_load_dataset_metadata_into_documents, +) +from .opac import ( + load_documents_from_opac, + task_load_documents_from_opac, +) +from .pipeline import ( + task_daily_metadata_sync_pipeline, +) +from .preprints import ( + load_preprints_from_preprints_api, + task_load_preprints_into_documents, +) +from .scielo_books import ( + load_documents_from_scielo_books, + sync_documents_from_scielo_books, + task_load_documents_from_scielo_books, + task_sync_documents_from_scielo_books, +) diff --git a/document/tasks/articlemeta.py b/document/tasks/articlemeta.py new file mode 100644 index 0000000..75b2689 --- /dev/null +++ b/document/tasks/articlemeta.py @@ -0,0 +1,120 @@ +import logging + +from django.db import DataError +from django.utils.translation import gettext as _ + +from core.collectors import articlemeta as articlemeta_collector +from core.utils import date_utils +from core.utils.request_utils import _get_user +from document.services import articles as article_service +from source.services import journals as journal_service + +from config import celery_app + +from .common import _get_collection + + +def load_documents_from_article_meta( + from_date=None, + until_date=None, + days_to_go_back=None, + collection=None, + issn=None, + force_update=True, + user=None, +): + from_date, until_date = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + logging.info( + "Loading documents from Article Meta. From: %s, Until: %s, Collection: %s, ISSN: %s", + from_date, + until_date, + collection, + issn, + ) + + offset = 0 + limit = 1000 + while True: + response = articlemeta_collector.fetch_article_counter_dict( + from_date, + until_date, + offset=offset, + limit=limit, + collection=collection, + issn=issn, + ) + objects = response.get("objects") or [] + if not objects: + break + + for payload in objects: + collection_obj = _get_collection(payload.get("collection") or collection) + if not collection_obj: + logging.info( + "Collection not found for payload %s", + payload.get("code"), + ) + continue + + source = journal_service.find_journal_source_by_issns( + collection_obj, + payload.get("code_title"), + ) + if not source: + logging.info( + "Source not found for collection %s and ISSNs %s", + collection_obj.acron3, + payload.get("code_title"), + ) + continue + + try: + article_service.upsert_article_document_from_articlemeta( + payload, + collection=collection_obj, + source=source, + user=user, + force_update=force_update, + ) + except DataError as exc: + logging.error( + "Error saving Document from Article Meta. " + "Collection: %s, Source: %s, PIDv2: %s. 
Error: %s", + collection_obj, + source.source_id, + payload.get('code'), + exc + ) + continue + + offset += limit + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Article Meta)"), timelimit=-1, queue="load") +def task_load_documents_from_article_meta( + self, + from_date=None, + until_date=None, + days_to_go_back=None, + collection=None, + issn=None, + force_update=True, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_documents_from_article_meta( + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + collection=collection, + issn=issn, + force_update=force_update, + user=user, + ) diff --git a/document/tasks/common.py b/document/tasks/common.py new file mode 100644 index 0000000..1645918 --- /dev/null +++ b/document/tasks/common.py @@ -0,0 +1,43 @@ +import logging + +from collection.models import Collection +from document.models import Document +from source.models import Source + + +def _get_collection(acronym): + if not acronym: + return None + return Collection.objects.filter(acron3=acronym).first() + + +def get_latest_scielo_books_last_seq(collection="books"): + document_last_seq = _get_latest_last_seq_from_queryset( + Document.objects.filter(collection__acron3=collection).only("extra_data") + ) + source_last_seq = _get_latest_last_seq_from_queryset( + Source.objects.filter( + collection__acron3=collection, + source_type=Source.SOURCE_TYPE_BOOK, + ).only("extra_data") + ) + return max(document_last_seq, source_last_seq) + + +def _get_latest_last_seq_from_queryset(queryset): + latest = 0 + for item in queryset.iterator(): + value = _coerce_last_seq((item.extra_data or {}).get("last_seq")) + if value is not None and value > latest: + latest = value + return latest + + +def _coerce_last_seq(value): + if value in (None, ""): + return None + try: + return int(value) + except (TypeError, ValueError): + logging.warning("Ignoring invalid SciELO Books last_seq value: %r", value) + return None diff --git a/document/tasks/dataverse.py b/document/tasks/dataverse.py new file mode 100644 index 0000000..15618a5 --- /dev/null +++ b/document/tasks/dataverse.py @@ -0,0 +1,80 @@ +import logging + +from django.db import DataError +from django.utils.translation import gettext as _ + +from core.collectors import dataverse as dataverse_collector +from core.utils import date_utils +from core.utils.request_utils import _get_user +from document.services import datasets as dataset_service + +from config import celery_app + +from .common import _get_collection + + +def load_dataset_metadata_from_dataverse( + from_date=None, + until_date=None, + days_to_go_back=None, + force_update=True, + user=None, +): + from_date, until_date = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + logging.info( + "Loading dataset metadata into documents. From: %s, Until: %s", + from_date, + until_date, + ) + + collection_obj = _get_collection("data") + if not collection_obj: + logging.error("Collection not found: data") + return False + + for payload in dataverse_collector.iter_dataset_metadata(from_date, until_date): + if not payload.get("dataset_doi"): + logging.error("Dataset DOI not found in record: %s", payload) + continue + + try: + dataset_service.upsert_dataset_document( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + ) + except DataError as exc: + logging.error( + "Error saving Dataset Document. Collection: %s, PID: %s. 
Error: %s", + collection_obj, + payload.get('dataset_doi'), + exc + ) + continue + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Dataverse)"), timelimit=-1, queue="load") +def task_load_dataset_metadata_into_documents( + self, + from_date=None, + until_date=None, + days_to_go_back=None, + force_update=True, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_dataset_metadata_from_dataverse( + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + force_update=force_update, + user=user, + ) diff --git a/document/tasks/opac.py b/document/tasks/opac.py new file mode 100644 index 0000000..5e1c81e --- /dev/null +++ b/document/tasks/opac.py @@ -0,0 +1,107 @@ +import logging + +from django.db import DataError +from django.utils.translation import gettext as _ + +from core.collectors import opac as opac_collector +from core.utils import date_utils +from core.utils.request_utils import _get_user +from document.services import articles as article_service +from source.services import journals as journal_service + +from config import celery_app + +from .common import _get_collection + + +def load_documents_from_opac( + collection="scl", + from_date=None, + until_date=None, + days_to_go_back=None, + page=1, + force_update=True, + user=None, +): + from_date, until_date = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + logging.info( + "Loading documents from OPAC. From: %s, Until: %s, Collection: %s", + from_date, + until_date, + collection, + ) + + collection_obj = _get_collection(collection) + if not collection_obj: + logging.error("Collection not found: %s", collection) + return False + + while True: + response = opac_collector.fetch_counter_dict(from_date, until_date, page=page) + documents = response.get("documents") or {} + + for payload in documents.values(): + source = journal_service.find_journal_source_by_acronym( + collection_obj, + payload.get("journal_acronym"), + ) + if not source: + logging.info( + "Source not found for collection %s and acronym %s", + collection_obj.acron3, + payload.get("journal_acronym"), + ) + continue + + try: + article_service.upsert_article_document_from_opac( + payload, + collection=collection_obj, + source=source, + user=user, + force_update=force_update, + ) + except DataError as exc: + logging.error( + "Error saving Document from OPAC. " + "Collection: %s, Source: %s, PIDv2: %s. 
Error: %s", + collection_obj, + source.source_id, + payload.get('pid_v2'), + exc + ) + continue + + page += 1 + if page > int(response.get("pages", 0)): + break + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (OPAC)"), timelimit=-1, queue="load") +def task_load_documents_from_opac( + self, + collection="scl", + from_date=None, + until_date=None, + days_to_go_back=None, + page=1, + force_update=True, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_documents_from_opac( + collection=collection, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + page=page, + force_update=force_update, + user=user, + ) diff --git a/document/tasks/pipeline.py b/document/tasks/pipeline.py new file mode 100644 index 0000000..97bef7c --- /dev/null +++ b/document/tasks/pipeline.py @@ -0,0 +1,24 @@ +import logging + +from celery import group +from django.utils.translation import gettext as _ + +from config import celery_app + +from .articlemeta import task_load_documents_from_article_meta +from .dataverse import task_load_dataset_metadata_into_documents +from .opac import task_load_documents_from_opac +from .preprints import task_load_preprints_into_documents +from .scielo_books import task_sync_documents_from_scielo_books + + +@celery_app.task(bind=True, name=_("[Metadata] Daily Sync Routine (Auto)"), queue="load") +def task_daily_metadata_sync_pipeline(self): + logging.info("Starting Daily Metadata Sync Pipeline") + group([ + task_load_documents_from_article_meta.s(), + task_load_documents_from_opac.s(), + task_load_preprints_into_documents.s(), + task_load_dataset_metadata_into_documents.s(), + task_sync_documents_from_scielo_books.s(), + ]).apply_async() diff --git a/document/tasks/preprints.py b/document/tasks/preprints.py new file mode 100644 index 0000000..ee63211 --- /dev/null +++ b/document/tasks/preprints.py @@ -0,0 +1,82 @@ +import logging + +from django.db import DataError +from django.utils.translation import gettext as _ + +from core.collectors import preprints as preprints_collector +from core.utils import date_utils +from core.utils.request_utils import _get_user +from document.services import preprints as preprint_service + +from config import celery_app + +from .common import _get_collection + + +def load_preprints_from_preprints_api( + from_date=None, + until_date=None, + days_to_go_back=None, + force_update=True, + user=None, +): + from_date, until_date = date_utils.get_date_range_str( + from_date, + until_date, + days_to_go_back, + ) + logging.info( + "Loading preprints into documents. From: %s, Until: %s", + from_date, + until_date, + ) + + collection_obj = _get_collection("preprints") + if not collection_obj: + logging.error("Collection not found: preprints") + return False + + for record in preprints_collector.iter_records(from_date, until_date): + payload = preprints_collector.extract_record_data(record) + + if not payload.get("pid_generic"): + logging.error("Preprint ID not found in record: %s", record) + continue + + try: + preprint_service.upsert_preprint_document( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + ) + except DataError as exc: + logging.error( + "Error saving Preprint Document. Collection: %s, PID: %s. 
Error: %s", + collection_obj, + payload.get('pid_generic'), + exc + ) + continue + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Preprints)"), timelimit=-1, queue="load") +def task_load_preprints_into_documents( + self, + from_date=None, + until_date=None, + days_to_go_back=None, + force_update=True, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_preprints_from_preprints_api( + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + force_update=force_update, + user=user, + ) diff --git a/document/tasks/scielo_books.py b/document/tasks/scielo_books.py new file mode 100644 index 0000000..ddbd462 --- /dev/null +++ b/document/tasks/scielo_books.py @@ -0,0 +1,247 @@ +import logging + +from django.conf import settings +from django.utils.translation import gettext as _ + +from core.collectors import scielo_books as scielo_books_collector +from core.utils.request_utils import _get_user +from document.services import books as document_books_service +from source.services import books as source_books_service + +from config import celery_app + +from .common import get_latest_scielo_books_last_seq + + +def load_documents_from_scielo_books( + collection="books", + db_name=None, + since=0, + limit=None, + force_update=True, + headers=None, + base_url=None, + user=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + collection_obj = source_books_service.get_books_collection(collection) + monograph_cache = {} + + logging.info( + "Loading documents from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s", + collection, + db_name, + since, + limit, + ) + + for item in scielo_books_collector.iter_change_documents( + base_url=base_url, + db_name=db_name, + since=since, + limit=limit, + headers=headers, + ): + change = item["change"] + raw_id = change.get("id") + + if item["deleted"]: + delete_source = document_books_service.has_monograph_document_for_raw_id( + collection_obj, + raw_id, + ) + document_books_service.delete_document_by_raw_id(collection_obj, raw_id) + if delete_source: + source_books_service.delete_book_source(collection_obj, raw_id) + continue + + payload = item["payload"] or {} + source_url = item.get("source_url") + last_seq = change.get("seq") + + if payload.get("TYPE") == "Monograph": + source = source_books_service.upsert_monograph_source( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=source_url, + last_seq=last_seq, + ) + document_books_service.upsert_monograph_document( + payload, + collection=collection_obj, + source=source, + user=user, + force_update=force_update, + source_url=source_url, + last_seq=last_seq, + ) + monograph_cache[str(payload.get("id"))] = payload + continue + + if payload.get("TYPE") != "Part": + continue + + monograph_payload = _get_monograph_payload( + payload, + monograph_cache=monograph_cache, + base_url=base_url, + db_name=db_name, + headers=headers, + ) + if not monograph_payload: + logging.warning( + "Skipping part %s because monograph %s could not be loaded.", + payload.get("id"), + payload.get("monograph"), + ) + continue + + source = source_books_service.upsert_monograph_source( + monograph_payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=None, + last_seq=last_seq, + ) + parent_document = document_books_service.upsert_monograph_document( + monograph_payload, + 
collection=collection_obj, + source=source, + user=user, + force_update=force_update, + source_url=None, + last_seq=last_seq, + ) + enriched_payload = document_books_service.enrich_part_payload( + payload, + monograph_payload, + ) + document_books_service.upsert_part_document( + enriched_payload, + collection=collection_obj, + source=source, + parent_document=parent_document, + user=user, + force_update=force_update, + source_url=source_url, + last_seq=last_seq, + ) + + return True + + +def sync_documents_from_scielo_books( + collection="books", + db_name=None, + limit=None, + force_update=True, + headers=None, + base_url=None, + user=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + since = get_latest_scielo_books_last_seq(collection=collection) + logging.info( + "Syncing documents from SciELO Books incrementally. Collection: %s, Since: %s, Limit: %s", + collection, + since, + limit, + ) + return load_documents_from_scielo_books( + collection=collection, + db_name=db_name, + since=since, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Manual)"), queue="load") +def task_load_documents_from_scielo_books( + self, + collection="books", + db_name=None, + since=0, + limit=None, + force_update=True, + headers=None, + base_url=None, + user_id=None, + username=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + user = _get_user(self.request, username=username, user_id=user_id) + return load_documents_from_scielo_books( + collection=collection, + db_name=db_name, + since=since, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Incremental)"), queue="load") +def task_sync_documents_from_scielo_books( + self, + collection="books", + db_name=None, + limit=None, + force_update=True, + headers=None, + base_url=None, + user_id=None, + username=None, +): + db_name = db_name or settings.SCIELO_BOOKS_DB_NAME + limit = limit or settings.SCIELO_BOOKS_LIMIT + user = _get_user(self.request, username=username, user_id=user_id) + return sync_documents_from_scielo_books( + collection=collection, + db_name=db_name, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) + + +def _get_monograph_payload(payload, monograph_cache, base_url=None, db_name=None, headers=None): + monograph_id = payload.get("monograph") + if not monograph_id: + return None + + monograph_key = str(monograph_id) + if monograph_key in monograph_cache: + return monograph_cache[monograph_key] + + try: + monograph_payload, _ = scielo_books_collector.fetch_document( + doc_id=monograph_id, + base_url=base_url, + db_name=db_name or settings.SCIELO_BOOKS_DB_NAME, + headers=headers, + ) + except Exception as exc: + logging.warning( + "Failed to fetch monograph %s for part %s: %s", + monograph_id, + payload.get("id"), + exc, + ) + return None + + monograph_cache[monograph_key] = monograph_payload + return monograph_payload diff --git a/document/tests.py b/document/tests.py new file mode 100644 index 0000000..14d9bcd --- /dev/null +++ b/document/tests.py @@ -0,0 +1,255 @@ +from django.test import TestCase +from unittest.mock import patch + +from collection.models import Collection +from document import tasks as 
document_tasks +from source.services import books as source_books_service +from source.models import Source + +from .models import Document +from .services import articles as article_service +from .services import books as books_service +from .services import datasets as dataset_service +from .services import preprints as preprint_service + + +class DocumentMetadataTests(TestCase): + def test_metadata_includes_source_context_and_legacy_identifiers(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + title="Test Journal", + identifiers={"scielo_issn": "1234-5678"}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_ARTICLE, + document_id="S123456782024000100001", + scielo_issn="1234-5678", + pid_v2="S123456782024000100001", + pid_v3="abc123", + title="Test Article", + identifiers={"doi": "10.1590/example"}, + files={"pt": {"path": "/pdf/test.pdf"}}, + default_lang="en", + text_langs=["en", "pt"], + publication_date="2024-01-15", + publication_year="2024", + ) + + metadata = list(Document.metadata(collection=collection)) + + self.assertEqual(len(metadata), 1) + self.assertEqual(metadata[0]["document_type"], Document.DOCUMENT_TYPE_ARTICLE) + self.assertEqual(metadata[0]["document_id"], "S123456782024000100001") + self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(metadata[0]["source_id"], "1234-5678") + self.assertEqual(metadata[0]["scielo_issn"], "1234-5678") + + def test_upsert_monograph_and_part_documents_from_books_payload(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + monograph_payload = { + "TYPE": "Monograph", + "id": "abcd1", + "title": "Sample Book", + "isbn": "9788578791889", + "eisbn": "9788578791880", + "doi_number": "10.1234/book", + "language": "pt", + "publication_date": "2024-05-20", + "year": "2024", + "publisher": "SciELO Books", + } + part_payload = { + "TYPE": "Part", + "id": "18", + "monograph": "abcd1", + "title": "Chapter 18", + "text_language": "es", + "order": "18", + } + + source = source_books_service.upsert_monograph_source( + monograph_payload, + collection=collection, + ) + parent_document = books_service.upsert_monograph_document( + monograph_payload, + collection=collection, + source=source, + ) + chapter = books_service.upsert_part_document( + books_service.enrich_part_payload(part_payload, monograph_payload), + collection=collection, + source=source, + parent_document=parent_document, + ) + + self.assertEqual(parent_document.document_type, Document.DOCUMENT_TYPE_BOOK) + self.assertEqual(parent_document.document_id, "book:abcd1") + self.assertEqual(parent_document.pid_generic, "book:abcd1") + self.assertEqual(chapter.document_type, Document.DOCUMENT_TYPE_CHAPTER) + self.assertEqual(chapter.document_id, "book:abcd1/chapter:18") + self.assertEqual(chapter.parent_document, parent_document) + self.assertEqual(chapter.identifiers["book_id"], "abcd1") + self.assertEqual(chapter.default_lang, "es") + + def test_articlemeta_and_opac_upsert_same_document(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + acronym="testjou", + title="Test Journal", + identifiers={"scielo_issn": 
"1234-5678"}, + ) + + first = article_service.upsert_article_document_from_articlemeta( + { + "code": "S123456782024000100001", + "title": "Article Title", + "pdfs": {"en": {"url": "/pdf/en.pdf"}}, + "processing_date": "2024-02-10", + "publication_date": "2024-01-15", + "publication_year": "2024", + "default_language": "en", + "text_langs": ["en", "pt"], + "code_title": ["1234-5678"], + }, + collection=collection, + source=source, + ) + second = article_service.upsert_article_document_from_opac( + { + "pid_v2": "S123456782024000100001", + "pid_v3": "S1234-56782024000100001", + "title": "Article Title", + "journal_acronym": "testjou", + "publication_date": "2024-01-15", + "default_language": "en", + "text_langs": ["en", "pt"], + }, + collection=collection, + source=source, + ) + + self.assertEqual(first.pk, second.pk) + self.assertEqual(Document.objects.count(), 1) + second.refresh_from_db() + self.assertEqual(second.pid_v3, "S1234-56782024000100001") + self.assertEqual(second.identifiers["journal_acronym"], "testjou") + + def test_upsert_preprint_document_maps_metadata(self): + collection = Collection.objects.create(acron3="preprints", acron2="pp") + + document = preprint_service.upsert_preprint_document( + { + "pid_generic": "preprint/123", + "title": "Preprint Title", + "text_langs": ["en", "pt"], + "default_language": "en", + "publication_date": "2024-01-20", + "publication_year": "2024", + }, + collection=collection, + ) + + self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_PREPRINT) + self.assertEqual(document.document_id, "preprint/123") + self.assertEqual(document.pid_generic, "preprint/123") + self.assertEqual(document.default_lang, "en") + + def test_upsert_dataset_document_accumulates_files(self): + collection = Collection.objects.create(acron3="data", acron2="dt") + + dataset_service.upsert_dataset_document( + { + "title": "Dataset Title", + "dataset_doi": "10.1234/dataset", + "dataset_published": "2024-03-15", + "file_id": "1", + "file_name": "first.csv", + "file_url": "https://example.org/first.csv", + "file_persistent_id": "pid:first", + }, + collection=collection, + ) + document = dataset_service.upsert_dataset_document( + { + "title": "Dataset Title", + "dataset_doi": "10.1234/dataset", + "dataset_published": "2024-03-15", + "file_id": "2", + "file_name": "second.csv", + "file_url": "https://example.org/second.csv", + "file_persistent_id": "pid:second", + }, + collection=collection, + ) + + self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_DATASET) + self.assertEqual(document.document_id, "10.1234/dataset") + self.assertEqual(set(document.files.keys()), {"1", "2"}) + + +class DocumentBooksSyncTests(TestCase): + def test_get_latest_scielo_books_last_seq_uses_documents_and_sources(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="book-1", + title="Book 1", + extra_data={"last_seq": 120}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:book-1", + extra_data={"last_seq": "135"}, + ) + + self.assertEqual(document_tasks.get_latest_scielo_books_last_seq("books"), 135) + + def test_sync_documents_from_scielo_books_uses_computed_since(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + source = Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id="book-1", + 
title="Book 1", + extra_data={"last_seq": 120}, + ) + Document.objects.create( + collection=collection, + source=source, + document_type=Document.DOCUMENT_TYPE_BOOK, + document_id="book:book-1", + extra_data={"last_seq": 135}, + ) + + with patch("document.tasks.scielo_books.load_documents_from_scielo_books", return_value=True) as mocked: + result = document_tasks.sync_documents_from_scielo_books( + collection="books", + db_name="scielobooks_1a", + limit=500, + ) + + self.assertTrue(result) + mocked.assert_called_once_with( + collection="books", + db_name="scielobooks_1a", + since=135, + limit=500, + force_update=True, + headers=None, + base_url=None, + user=None, + ) diff --git a/article/wagtail_hooks.py b/document/wagtail_hooks.py similarity index 50% rename from article/wagtail_hooks.py rename to document/wagtail_hooks.py index 4cf55bd..de291c9 100644 --- a/article/wagtail_hooks.py +++ b/document/wagtail_hooks.py @@ -1,39 +1,35 @@ from django.utils.translation import gettext_lazy as _ from wagtail.snippets.views.snippets import SnippetViewSet -from wagtail.snippets.models import register_snippet -from config.menu import get_menu_order +from .models import Document -from .models import Article - -class ArticleSnippetViewSet(SnippetViewSet): - model = Article +class DocumentSnippetViewSet(SnippetViewSet): + model = Document icon = "folder-open-inverse" - menu_name = "article" - menu_label = _("Article") - menu_order = get_menu_order("article") - add_to_admin_menu = True + menu_label = _("Document") + menu_order = 300 list_display = ( "collection", - "scielo_issn", + "document_type", + "document_id", + "source", + "title", "pid_v2", "pid_v3", "pid_generic", - "files", "publication_year", ) list_filter = ( "collection", - "scielo_issn", + "document_type", "publication_year", ) search_fields = ( - "scielo_issn", + "document_id", + "title", "pid_v2", "pid_v3", "pid_generic", ) - -register_snippet(ArticleSnippetViewSet) diff --git a/journal/__init__.py b/journal/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/journal/admin.py b/journal/admin.py deleted file mode 100644 index 8c38f3f..0000000 --- a/journal/admin.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.contrib import admin - -# Register your models here. 
diff --git a/journal/migrations/0001_initial.py b/journal/migrations/0001_initial.py deleted file mode 100644 index 7164bbc..0000000 --- a/journal/migrations/0001_initial.py +++ /dev/null @@ -1,122 +0,0 @@ -# Generated by Django 5.0.7 on 2025-02-07 17:50 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - initial = True - - dependencies = [ - ("collection", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Journal", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "scielo_issn", - models.CharField(max_length=9, verbose_name="SciELO ISSN"), - ), - ( - "issns", - models.JSONField( - blank=True, default=dict, null=True, verbose_name="ISSNs" - ), - ), - ( - "acronym", - models.CharField( - blank=True, - default="", - max_length=32, - null=True, - verbose_name="Journal Acronym", - ), - ), - ( - "title", - models.CharField(max_length=255, verbose_name="Journal Title"), - ), - ( - "publisher_name", - models.JSONField( - blank=True, - default=list, - null=True, - verbose_name="Publisher Name", - ), - ), - ( - "subject_areas", - models.JSONField( - default=list, verbose_name="Subject Areas (CAPES)" - ), - ), - ( - "wos_subject_areas", - models.JSONField(default=list, verbose_name="Subject Areas (WoS)"), - ), - ( - "collection", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="collection.collection", - verbose_name="Collection", - ), - ), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "verbose_name": "Journal", - "verbose_name_plural": "Journals", - "unique_together": {("collection", "scielo_issn", "acronym")}, - }, - ), - ] diff --git a/journal/migrations/0002_alter_journal_scielo_issn.py b/journal/migrations/0002_alter_journal_scielo_issn.py deleted file mode 100644 index 07cf94f..0000000 --- a/journal/migrations/0002_alter_journal_scielo_issn.py +++ /dev/null @@ -1,19 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-12 17:16 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("journal", "0001_initial"), - ] - - operations = [ - migrations.AlterField( - model_name="journal", - name="scielo_issn", - field=models.CharField( - db_index=True, max_length=9, verbose_name="SciELO ISSN" - ), - ), - ] diff --git a/journal/migrations/__init__.py b/journal/migrations/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/journal/models.py b/journal/models.py deleted file mode 100644 index 0d830e9..0000000 --- a/journal/models.py +++ /dev/null @@ -1,100 +0,0 @@ -from django.db import models -from django.utils.translation import gettext_lazy as _ - -from core.models import CommonControlField -from 
collection.models import Collection - - -class Journal(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.CASCADE, - blank=False, - null=False, - db_index=True, - ) - - scielo_issn = models.CharField( - verbose_name=_('SciELO ISSN'), - max_length=9, - blank=False, - null=False, - db_index=True, - ) - - issns = models.JSONField( - verbose_name=_('ISSNs'), - null=True, - blank=True, - default=dict, - ) - - acronym = models.CharField( - verbose_name=_('Journal Acronym'), - max_length=32, - blank=True, - null=True, - default='', - ) - - title = models.CharField( - verbose_name=_('Journal Title'), - max_length=255, - blank=False, - null=False, - ) - - publisher_name = models.JSONField( - verbose_name=_('Publisher Name'), - blank=True, - null=True, - default=list, - ) - - subject_areas = models.JSONField( - verbose_name=_('Subject Areas (CAPES)'), - null=False, - blank=False, - default=list, - ) - - wos_subject_areas = models.JSONField( - verbose_name=_('Subject Areas (WoS)'), - null=False, - blank=False, - default=list, - ) - - def __str__(self): - return f'{self.collection.acron2} - {self.scielo_issn} - {self.acronym}' - - @classmethod - def metadata(cls, collection=None): - queryset = cls.objects.all() - if collection: - queryset = queryset.filter(collection=collection) - - for journal in queryset.only( - 'acronym', 'collection__acron3', 'issns', 'publisher_name', - 'scielo_issn', 'subject_areas', 'title', 'wos_subject_areas' - ): - yield { - 'acronym': journal.acronym, - 'collection': journal.collection.acron3, - 'issns': set([v for v in journal.issns.values() if v]), - 'publisher_name': journal.publisher_name, - 'scielo_issn': journal.scielo_issn, - 'subject_areas': journal.subject_areas, - 'title': journal.title, - 'wos_subject_areas': journal.wos_subject_areas, - } - - class Meta: - verbose_name = _('Journal') - verbose_name_plural = _('Journals') - unique_together = ( - 'collection', - 'scielo_issn', - 'acronym', - ) diff --git a/journal/tasks.py b/journal/tasks.py deleted file mode 100644 index 71681cb..0000000 --- a/journal/tasks.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -from django.contrib.auth import get_user_model -from django.db import IntegrityError -from django.utils import timezone -from django.utils.translation import gettext as _ - -from collection.models import Collection -from config import celery_app -from core.utils.utils import _get_user - -from . 
import models, utils - - -User = get_user_model() - - -@celery_app.task(bind=True, name=_('Load journal data from Article Meta'), queue='load') -def task_load_journal_data_from_article_meta(self, collections=[], force_update=True, user_id=None, username=None, mode='thrift'): - user = _get_user(user_id, username) - - for col in collections or Collection.acron3_list(): - for j in utils.fetch_article_meta_journals(collection=col, mode=mode): - collection = Collection.objects.get(acron3=j.collection_acronym) - if not collection: - logging.error(f'Collection {j.collection_acronym} does not exist') - continue - - try: - journal, created = models.Journal.objects.get_or_create(collection=collection, scielo_issn=j.scielo_issn) - except IntegrityError as e: - logging.error(f'Journal {j} has not been created due to error: {e}') - continue - - if created: - journal.creator = user - journal.created = timezone.now() - - if created or force_update: - journal.updated_by = user - journal.updated = timezone.now() - journal.issns = { - 'electronic_issn': j.electronic_issn or '', - 'print_issn': j.print_issn or '', - 'scielo_issn': j.scielo_issn - } - journal.acronym = j.acronym - journal.title = j.title - journal.publisher_name = j.publisher_name or '' - journal.subject_areas = j.subject_areas or [] - journal.wos_subject_areas = j.wos_subject_areas or [] - logging.info(f'Journal {"created" if created else "updated"}: {journal}') - - journal.save() - - return True diff --git a/journal/tests.py b/journal/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/journal/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. diff --git a/journal/utils.py b/journal/utils.py deleted file mode 100644 index 8a80521..0000000 --- a/journal/utils.py +++ /dev/null @@ -1,19 +0,0 @@ -from articlemeta.client import ThriftClient, RestfulClient - - -def fetch_article_meta_journals(collection='scl', mode='rest'): - """ - Fetches article metadata from journals. - - Returns - ------- - list - A list of article metadata. - """ - if mode == 'rest': - am = RestfulClient() - elif mode == 'thrift': - am = ThriftClient() - - for j in am.journals(collection=collection): - yield j diff --git a/journal/views.py b/journal/views.py deleted file mode 100644 index 91ea44a..0000000 --- a/journal/views.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.shortcuts import render - -# Create your views here. 
diff --git a/journal/wagtail_hooks.py b/journal/wagtail_hooks.py deleted file mode 100644 index 725b370..0000000 --- a/journal/wagtail_hooks.py +++ /dev/null @@ -1,40 +0,0 @@ -from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet -from wagtail.snippets.models import register_snippet - -from config.menu import get_menu_order - -from .models import Journal - - -class JournalSnippetViewSet(SnippetViewSet): - model = Journal - icon = "folder-open-inverse" - menu_name = "journal" - menu_label = _("Journal") - menu_order = get_menu_order('journal') - add_to_admin_menu = True - - list_display = ( - "collection", - "scielo_issn", - "acronym", - "title", - "issns", - "publisher_name", - "subject_areas", - "wos_subject_areas", - ) - list_filter = ( - "collection", - ) - search_fields = ( - "issns", - "acronym", - "publisher_name", - "subject_areas", - "wos_subject_areas", - ) - - -register_snippet(JournalSnippetViewSet) diff --git a/local.yml b/local.yml index 3c25357..9b3a047 100644 --- a/local.yml +++ b/local.yml @@ -11,10 +11,15 @@ services: - mailhog volumes: - .:/app:z - - ../scms_data/scielo_usage/data/logs:/app/logs + - /mnt/pidata2/pi/scl/logs:/app/logs + # Uncomment to use local SciELO lib repos for development: + # - ../scielo_log_validator:/app/scielo_log_validator:z + # - ../scielo_usage_counter:/app/scielo_usage_counter:z env_file: - ./.envs/.local/.django - ./.envs/.local/.postgres + environment: + - USE_LOCAL_SCIELO_LIBS=0 ports: - "8009:8000" command: /start @@ -40,7 +45,7 @@ services: - "8029:8025" redis: - image: redis:6 + image: redis:8 container_name: scielo_usage_local_redis ports: - "6399:6379" diff --git a/log_manager/choices.py b/log_manager/choices.py index e98c8f2..c6e461a 100644 --- a/log_manager/choices.py +++ b/log_manager/choices.py @@ -19,13 +19,3 @@ (LOG_FILE_STATUS_IGNORED, _("Ignored")), ] - -COLLECTION_LOG_FILE_DATE_COUNT_OK = 'OK' -COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES = 'MIS' -COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES = 'EXT' - -COLLECTION_LOG_FILE_DATE_COUNT = [ - (COLLECTION_LOG_FILE_DATE_COUNT_OK, _("OK")), - (COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES, _("Missing Files")), - (COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES, _("Extra files")), -] diff --git a/log_manager/management/__init__.py b/log_manager/management/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/log_manager/management/__init__.py @@ -0,0 +1 @@ + diff --git a/log_manager/management/commands/__init__.py b/log_manager/management/commands/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/log_manager/management/commands/__init__.py @@ -0,0 +1 @@ + diff --git a/log_manager/management/commands/reset_log_catalog.py b/log_manager/management/commands/reset_log_catalog.py new file mode 100644 index 0000000..5ded576 --- /dev/null +++ b/log_manager/management/commands/reset_log_catalog.py @@ -0,0 +1,94 @@ +from django.core.management.base import BaseCommand +from django.db import transaction + +from log_manager.models import LogFile +from metrics.models import DailyMetricJob +from metrics.services import daily_payloads +from reports.models import MonthlyLogReport, WeeklyLogReport, YearlyLogReport +from tracker.models import LogFileDiscardedLine + + +class Command(BaseCommand): + help = ( + "Clear the log catalog stored in the database, including derived parsing " + "records, daily metric payloads, and optionally reports, " + "while preserving the source log files on disk." 
+ ) + + def add_arguments(self, parser): + parser.add_argument( + "--collection", + action="append", + dest="collections", + help="Collection acronym to limit cleanup. Repeat the option for multiple collections.", + ) + parser.add_argument( + "--reports", + action="store_true", + default=False, + help="Also clear Weekly/Monthly/Yearly log reports for the selected collections.", + ) + + def handle(self, *args, **options): + collections = options.get("collections") or [] + clear_reports = options.get("reports") + + log_files = LogFile.objects.all() + if collections: + log_files = log_files.filter(collection__acron3__in=collections) + + log_file_ids = list(log_files.values_list("id", flat=True)) + if not log_file_ids: + self.stdout.write(self.style.WARNING("No log catalog rows found for cleanup.")) + return + + daily_jobs = DailyMetricJob.objects.all() + if collections: + daily_jobs = daily_jobs.filter(collection__acron3__in=collections) + payload_paths = list(daily_jobs.exclude(storage_path="").values_list("storage_path", flat=True)) + + summary = { + "log_files": len(log_file_ids), + "discarded_lines": LogFileDiscardedLine.objects.filter( + log_file_id__in=log_file_ids + ).count(), + "daily_metric_jobs": daily_jobs.count(), + } + + for storage_path in payload_paths: + daily_payloads.delete_payload(storage_path) + + with transaction.atomic(): + LogFileDiscardedLine.objects.filter(log_file_id__in=log_file_ids).delete() + daily_jobs.delete() + LogFile.objects.filter(id__in=log_file_ids).delete() + + if clear_reports: + report_qs = WeeklyLogReport.objects.all() + m_qs = MonthlyLogReport.objects.all() + y_qs = YearlyLogReport.objects.all() + if collections: + report_qs = report_qs.filter(collection__acron3__in=collections) + m_qs = m_qs.filter(collection__acron3__in=collections) + y_qs = y_qs.filter(collection__acron3__in=collections) + summary["weekly_reports"] = report_qs.count() + summary["monthly_reports"] = m_qs.count() + summary["yearly_reports"] = y_qs.count() + report_qs.delete() + m_qs.delete() + y_qs.delete() + + msg = ( + f"Cleared log catalog: " + f"{summary['log_files']} log files, " + f"{summary['discarded_lines']} discarded lines, " + f"{summary['daily_metric_jobs']} daily metric jobs." + ) + if clear_reports: + msg += ( + f" Also cleared reports: " + f"{summary['weekly_reports']} weekly, " + f"{summary['monthly_reports']} monthly, " + f"{summary['yearly_reports']} yearly." 
+ ) + self.stdout.write(self.style.SUCCESS(msg)) diff --git a/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py new file mode 100644 index 0000000..d30cdf4 --- /dev/null +++ b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py @@ -0,0 +1,52 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("log_manager", "0009_collectionlogfiledatecount_exported_files_count"), + ] + + operations = [ + migrations.RemoveField( + model_name="logfiledate", + name="creator", + ), + migrations.RemoveField( + model_name="logfiledate", + name="log_file", + ), + migrations.RemoveField( + model_name="logfiledate", + name="updated_by", + ), + migrations.RemoveField( + model_name="logfile", + name="creator", + ), + migrations.RemoveField( + model_name="logfile", + name="updated_by", + ), + migrations.AddField( + model_name="logfile", + name="date", + field=models.DateField( + blank=True, db_index=True, null=True, verbose_name="Date" + ), + ), + migrations.AddField( + model_name="logfile", + name="parse_heartbeat_at", + field=models.DateTimeField( + blank=True, null=True, verbose_name="Parse Heartbeat At" + ), + ), + migrations.DeleteModel( + name="CollectionLogFileDateCount", + ), + migrations.DeleteModel( + name="LogFileDate", + ), + ] diff --git a/log_manager/models.py b/log_manager/models.py index fc3a8b6..6bf04d8 100644 --- a/log_manager/models.py +++ b/log_manager/models.py @@ -1,209 +1,20 @@ import logging -from django.db import models -from django.db.models import Q +from django.db import IntegrityError, models from django.utils import timezone from django.utils.translation import gettext_lazy as _ from wagtail.admin.panels import FieldPanel from wagtailautocomplete.edit_handlers import AutocompletePanel from collection.models import Collection -from core.forms import CoreAdminModelForm -from core.models import CommonControlField from . 
import choices -class LogFileDate(CommonControlField): - date = models.DateField( - verbose_name=_("Date"), - null=False, - blank=False, - db_index=True, - ) - - log_file = models.ForeignKey( - 'LogFile', - verbose_name=_('Log File'), - blank=True, - on_delete=models.DO_NOTHING, - db_index=True, - ) - - base_form_class = CoreAdminModelForm - - panel = [ - FieldPanel('date'), - AutocompletePanel('log_file') - ] - - class Meta: - ordering = ['-date'] - verbose_name = _("Log File Date") - verbose_name_plural = _("Log File Dates") - unique_together = ( - 'date', - 'log_file', - ) - indexes = [ - models.Index(fields=['date', 'log_file']), - ] - - @classmethod - def create_or_update(cls, user, log_file, date): - obj, created = cls.objects.get_or_create( - log_file=log_file, - date=date, - ) - - if not created: - obj.updated_by = user - obj.updated = timezone.now() - else: - obj.creator = user - obj.created = timezone.now() - - return obj - - @classmethod - def filter_by_collection_and_date(cls, collection, date): - return cls.objects.filter( - ~Q(log_file__status__in=[ - choices.LOG_FILE_STATUS_CREATED, - choices.LOG_FILE_STATUS_INVALIDATED - ]), - log_file__collection__acron3=collection, - date=date, - ) - - @classmethod - def get_number_of_found_files_for_date(cls, collection, date): - return cls.objects.filter( - ~Q(log_file__status__in=[ - choices.LOG_FILE_STATUS_CREATED, - choices.LOG_FILE_STATUS_INVALIDATED - ]), - log_file__collection__acron3=collection, - date=date, - ).count() - - def __str__(self): - return f'{self.log_file.path}-{self.date}' - - -class CollectionLogFileDateCount(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, - null=False, - blank=False, - ) - - date = models.DateField( - _('Date'), - null=False, - blank=False, - ) - - year = models.IntegerField( - _('Year'), - null=False, - blank=False, - ) - - month = models.IntegerField( - _('Month'), - null=False, - blank=False, - ) - - found_log_files = models.IntegerField( - verbose_name=_('Number of Found Valid Log Files'), - default=0, - ) - - expected_log_files = models.IntegerField( - verbose_name=_('Number of Expected Valid Log Files'), - blank=True, - null=True, - ) - - is_usage_metric_computed = models.BooleanField( - verbose_name=_('Is Usage Metric Computed'), - default=False, - ) - - exported_files_count = models.SmallIntegerField( - verbose_name=_('Exported Files Count'), - default=0, - ) - - status = models.CharField( - verbose_name=_('Status'), - choices=choices.COLLECTION_LOG_FILE_DATE_COUNT, - max_length=3, - ) - - def set_status(self): - if self.found_log_files < self.expected_log_files: - self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES - elif self.found_log_files > self.expected_log_files: - self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES - else: - self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_OK - - def set_is_usage_metric_computed(self): - if self.exported_files_count == self.found_log_files: - self.is_usage_metric_computed = True - - @classmethod - def create_or_update(cls, user, collection, date, expected_log_files, found_log_files): - obj, created = cls.objects.get_or_create( - collection=collection, - date=date, - month=date.month, - year=date.year, - ) - - if not created: - obj.updated_by = user - obj.updated = timezone.now() - else: - obj.creator = user - obj.created = timezone.now() - - obj.expected_log_files = expected_log_files - obj.found_log_files = found_log_files - 
obj.set_status() - - obj.save() - return obj - - class Meta: - ordering = ['-date'] - verbose_name = _("Collection Log File Date Count") - unique_together = ( - 'collection', - 'date', - ) - - panels = [ - AutocompletePanel('collection'), - FieldPanel('date'), - FieldPanel('year'), - FieldPanel('month'), - FieldPanel('found_log_files'), - FieldPanel('expected_log_files'), - FieldPanel('status'), - FieldPanel('is_usage_metric_computed'), - ] - - def __str__(self): - return f'{self.collection.acron3}-{self.date}' - - -class LogFile(CommonControlField): +class LogFile(models.Model): + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) + updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) + date = models.DateField(verbose_name=_("Date"), null=True, blank=True, db_index=True) hash = models.CharField(_("Hash MD5"), max_length=32, null=True, blank=True, unique=True) path = models.CharField(_("Name"), max_length=255, null=False, blank=False) @@ -246,19 +57,25 @@ class LogFile(CommonControlField): default=0, ) + parse_heartbeat_at = models.DateTimeField( + _("Parse Heartbeat At"), + null=True, + blank=True, + ) + panels = [ FieldPanel('hash'), + FieldPanel('date'), FieldPanel('path'), FieldPanel('stat_result'), FieldPanel('status'), FieldPanel('validation'), FieldPanel('summary'), FieldPanel('last_processed_line'), + FieldPanel('parse_heartbeat_at'), AutocompletePanel('collection'), ] - base_form_class = CoreAdminModelForm - class Meta: verbose_name = _("Log File") verbose_name_plural = _("Log Files") @@ -268,25 +85,28 @@ def get(cls, hash): return cls.objects.get(hash=hash) @classmethod - def create_or_update(cls, user, collection, path, stat_result, hash, status=None): + def create_or_update(cls, collection, path, stat_result, hash, status=None): try: + obj, created = cls.objects.get_or_create( + hash=hash, + defaults={ + "collection": collection, + "path": path, + "stat_result": stat_result, + "status": status or choices.LOG_FILE_STATUS_CREATED, + }, + ) + except IntegrityError: obj = cls.get(hash=hash) - obj.updated_by = user + created = False + + if created: + logging.info(f'File {path} added to the database.') + else: obj.updated = timezone.now() + obj.save(update_fields=["updated"]) logging.info(f'File {path} already exists in the database.') - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - obj.collection = collection - obj.path = path - obj.stat_result = stat_result - obj.hash = hash - obj.status = status or choices.LOG_FILE_STATUS_CREATED - logging.info(f'File {path} added to the database.') - - obj.save() return obj def __str__(self): diff --git a/log_manager/tasks.py b/log_manager/tasks.py index e14fe92..10148b1 100644 --- a/log_manager/tasks.py +++ b/log_manager/tasks.py @@ -1,20 +1,17 @@ import logging -import json import os from django.conf import settings -from django.core.mail import send_mail from django.contrib.auth import get_user_model from django.utils.translation import gettext as _ from core.utils import date_utils -from core.utils.utils import _get_user +from core.utils.request_utils import _get_user from config import celery_app from collection.models import Collection -from log_manager_config import exceptions as lmc_exceptions, models as lmc_models +from log_manager_config import models as lmc_models from . 
import ( - exceptions, choices, models, utils, @@ -26,8 +23,8 @@ User = get_user_model() -@celery_app.task(bind=True, name=_('Search for log files'), queue='load') -def task_search_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None): +@celery_app.task(bind=True, name=_('[Log Pipeline] 1. Search Logs (Manual)'), queue='load') +def task_search_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None, trigger_validation=False): """ Task to search for log files in the directories defined in the CollectionLogDirectory model. @@ -44,11 +41,11 @@ def task_search_log_files(self, collections=[], from_date=None, until_date=None, for col in collections or Collection.acron3_list(): collection = Collection.objects.get(acron3=col) - col_configs_dirs = lmc_models.CollectionLogDirectory.objects.filter(collection__acron3=col, active=True) + col_configs_dirs = lmc_models.CollectionLogDirectory.objects.filter(config__collection__acron3=col, active=True) if len(col_configs_dirs) == 0: logging.error(f'No CollectionLogDirectory found for collection {col}.') - supported_logfile_extensions = lmc_models.SupportedLogFile.objects.values_list('file_extension', flat=True) + supported_logfile_extensions = settings.SUPPORTED_LOGFILE_EXTENSIONS if len(supported_logfile_extensions) == 0: logging.error('No SupportedLogFile found. Please, add a SupportedLogFile for each of the supported log file formats.') @@ -62,7 +59,18 @@ def task_search_log_files(self, collections=[], from_date=None, until_date=None, visible_dates = _get_visible_dates(from_date, until_date, days_to_go_back) logging.debug(f'Visible dates: {visible_dates}') - _add_log_file(user, collection, root, name, visible_dates) + _add_log_file(collection, root, name, visible_dates) + + if trigger_validation: + task_validate_log_files.apply_async(kwargs={ + "collections": collections, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "user_id": user_id, + "username": username, + "trigger_parse": True + }) def _get_visible_dates(from_date, until_date, days_to_go_back): @@ -70,14 +78,13 @@ def _get_visible_dates(from_date, until_date, days_to_go_back): return date_utils.get_date_objs_from_date_range(from_date_str, until_date_str) -def _add_log_file(user, collection, root, name, visible_dates): +def _add_log_file(collection, root, name, visible_dates): file_path = os.path.join(root, name) file_ctime = date_utils.get_date_obj_from_timestamp(os.stat(file_path).st_ctime) logging.debug(f'Checking file {file_path} with ctime {file_ctime}.') if file_ctime in visible_dates: models.LogFile.create_or_update( - user=user, collection=collection, path=file_path, stat_result=os.stat(file_path), @@ -85,8 +92,8 @@ def _add_log_file(user, collection, root, name, visible_dates): ) -@celery_app.task(bind=True, name=_('Validate log files'), timelimit=-1, queue='load') -def task_validate_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None, ignore_date=False): +@celery_app.task(bind=True, name=_('[Log Pipeline] 2. Validate Logs (Manual)'), timelimit=-1, queue='load') +def task_validate_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None, ignore_date=False, trigger_parse=False, revalidate=False, status_list=None): """ Task to validate log files in the database. 
@@ -98,22 +105,58 @@ def task_validate_log_files(self, collections=[], from_date=None, until_date=Non user_id (int, optional): The ID of the user initiating the task. Defaults to None. username (str, optional): The username of the user initiating the task. Defaults to None. ignore_date (bool, optional): If True, ignore the date of the log file. Defaults to False. + revalidate (bool, optional): If True, also revalidate files in statuses from status_list. Defaults to False. + status_list (list, optional): List of status codes to revalidate when revalidate=True. Defaults to [QUE, INV, ERR]. """ cols = collections or Collection.acron3_list() logging.info(f'Validating log files for collections: {cols}.') visible_dates = _get_visible_dates(from_date, until_date, days_to_go_back) if not ignore_date: + if not visible_dates: + logging.warning("No visible dates found for log validation.") + return logging.info(f'Interval: {visible_dates[0]} to {visible_dates[-1]}.') + status_filter = [choices.LOG_FILE_STATUS_CREATED] + if revalidate: + status_filter += status_list or [choices.LOG_FILE_STATUS_QUEUED, choices.LOG_FILE_STATUS_INVALIDATED, choices.LOG_FILE_STATUS_ERROR] + + tasks = [] for col in cols: - for log_file in models.LogFile.objects.filter(status=choices.LOG_FILE_STATUS_CREATED, collection__acron3=col): + for log_file in models.LogFile.objects.filter(status__in=status_filter, collection__acron3=col): file_ctime = date_utils.get_date_obj_from_timestamp(log_file.stat_result[LOGFILE_STAT_RESULT_CTIME_INDEX]) if file_ctime in visible_dates or ignore_date: - task_validate_log_file.apply_async(args=(log_file.hash, user_id, username)) - - -@celery_app.task(bind=True, name=_('Validate log file'), timelimit=-1, queue='load') + tasks.append(task_validate_log_file.s(log_file.hash, user_id, username)) + + if tasks: + if trigger_parse: + from celery import chord + from metrics.tasks import task_parse_logs + chord(tasks)(task_parse_logs.si( + collections=collections, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + user_id=user_id, + username=username, + )) + else: + for task in tasks: + task.apply_async() + elif trigger_parse: + from metrics.tasks import task_parse_logs + task_parse_logs.apply_async(kwargs={ + "collections": collections, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "user_id": user_id, + "username": username, + }) + + +@celery_app.task(bind=True, name=_('[Log Pipeline] Validate Single Log File (Auto)'), timelimit=-1, queue='load') def task_validate_log_file(self, log_file_hash, user_id=None, username=None): """ Task to validate a specific log file. 
@@ -135,21 +178,21 @@ def task_validate_log_file(self, log_file_hash, user_id=None, username=None): del val_result['content']['summary']['datetimes'] if 'probably_date' in val_result: - val_result['probably_date'] = date_utils.get_date_str(val_result['probably_date']) - - try: - log_file.validation = val_result - log_file.validation.update({'buffer_size': buffer_size, 'sample_size': sample_size}) - except json.JSONDecodeError as e: - logging.error(f'Error serializing validation result: {e}') - log_file.validation = {} + if isinstance(val_result['probably_date'], dict): + logging.error(f"Error determining probably_date: {val_result['probably_date'].get('error')}") + val_result['probably_date'] = None + else: + try: + val_result['probably_date'] = date_utils.get_date_str(val_result['probably_date']) + except (ValueError, AttributeError) as e: + logging.error(f'Error serializing probably_date: {e}') + val_result['probably_date'] = None + + log_file.validation = val_result + log_file.validation.update({'buffer_size': buffer_size, 'sample_size': sample_size}) if val_result.get('is_valid', {}).get('all', False): - models.LogFileDate.create_or_update( - user=user, - log_file=log_file, - date=val_result.get('probably_date', ''), - ) + log_file.date = val_result.get('probably_date') or None log_file.status = choices.LOG_FILE_STATUS_QUEUED else: @@ -160,116 +203,19 @@ def task_validate_log_file(self, log_file_hash, user_id=None, username=None): def _fetch_validation_parameters(collection, default_buffer_size=0.1, default_sample_size=2048): - col_configs_params = lmc_models.CollectionValidationParameters.objects.filter(collection__acron3=collection).first() - if not col_configs_params: - logging.warning(f'No CollectionValidationParameters found for collection {collection}. Using default values.') + col_configs = lmc_models.LogManagerCollectionConfig.objects.filter(collection__acron3=collection).first() + if not col_configs: + logging.warning(f'No LogManagerCollectionConfig found for collection {collection}. Using default values.') return default_buffer_size, default_sample_size - return col_configs_params.buffer_size, col_configs_params.sample_size + return col_configs.buffer_size, col_configs.sample_size -@celery_app.task(bind=True, name=_('Check missing log files')) -def task_check_missing_logs_for_date_range(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None): +@celery_app.task(bind=True, name=_('[Log Pipeline] Daily Routine (Auto)'), queue='load') +def task_daily_log_ingestion_pipeline(self): """ - Task to check for missing log files in the defined date range. - - Parameters: - collections (list, optional): List of collection acronyms. Defaults to []. - from_date (str, optional): The start date for log discovery in YYYY-MM-DD format. Defaults to None. - until_date (str, optional): The end date for log discovery in YYYY-MM-DD format. Defaults to None. - days_to_go_back (int, optional): The number of days to go back from today for log discovery. Defaults to None. - user_id (int, optional): The ID of the user initiating the task. Defaults to None. - username (str, optional): The username of the user initiating the task. Defaults to None. - - Raises: - exceptions.UndefinedCollectionFilesPerDayError: Raised when there are no expected log files for the collection. - exceptions.MultipleFilesPerDayForTheSameDateError: Raised when there are multiple expected log files for the same date. + Facade task for the daily log ingestion pipeline. 
+ It initiates the Search -> Validate -> Parse chain using default parameters. + No arguments are required, making it easy to schedule periodically. """ - user = _get_user(self.request, username=username, user_id=user_id) - - from_date_str, until_date_str = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - - for col in collections or Collection.acron3_list(): - collection = Collection.objects.get(acron3=col) - for date in date_utils.get_date_objs_from_date_range(from_date_str, until_date_str): - logging.info(f'Couting logs for collection {col} and date {date}') - count_logs_for_date(user, collection, date) - - -def count_logs_for_date(user, collection, date): - try: - n_expected_files = lmc_models.CollectionLogFilesPerDay.get_number_of_expected_files_by_day(collection=collection.acron3, date=date) - except lmc_exceptions.UndefinedCollectionFilesPerDayError: - return - except lmc_exceptions.MultipleFilesPerDayForTheSameDateError: - return - - n_found_logs = models.LogFileDate.get_number_of_found_files_for_date(collection=collection.acron3, date=date) - - obj = models.CollectionLogFileDateCount.create_or_update( - user=user, - collection=collection, - date=date, - expected_log_files=n_expected_files, - found_log_files=n_found_logs, - ) - logging.info(f'Created CollectionLogFileDateCount object {obj}.') - - -@celery_app.task(bind=True, name=_('Generate log files count report')) -def task_log_files_count_status_report(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None): - from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back) - - from_date_obj = date_utils.get_date_obj(from_date) - until_date_obj = date_utils.get_date_obj(until_date) - - for collection in collections or Collection.acron3_list(): - col = models.Collection.objects.get(acron3=collection) - subject = _(f'Usage Log Validation Results ({from_date} to {until_date})') - message = _(f'This message provides the results of the Usage Log Validation for the period {from_date} to {until_date}:\n\n') - - missing = models.CollectionLogFileDateCount.objects.filter( - collection__acron3=collection, - status=choices.COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES, - date__gte=from_date_obj, - date__lte=until_date_obj, - ) - extra = models.CollectionLogFileDateCount.objects.filter( - collection__acron3=collection, - status=choices.COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES, - date__gte=from_date_obj, - date__lte=until_date_obj, - ) - ok = models.CollectionLogFileDateCount.objects.filter( - collection__acron3=collection, - status=choices.COLLECTION_LOG_FILE_DATE_COUNT_OK, - date__gte=from_date_obj, - date__lte=until_date_obj, - ) - - if missing.count() > 0: - message += _(f'- There are {missing.count()} missing log files.\n') - if extra.count() > 0: - message += _(f'- There are {extra.count()} extra log files.\n') - if ok.count() > 0: - message += _(f'- There are {ok.count()} dates with correct log files.\n') - - if missing.count() > 0 or extra.count() > 0: - message += _(f'\nPlease review the script responsible for sharing the log files.\n') - - message += _(f'\nYou can view the full report at {settings.WAGTAILADMIN_BASE_URL}/admin/snippets/log_manager/collectionlogfiledatecount/?collection={col.pk}>.') - - logging.info(f'Sending email to collection {col.main_name}. Subject: {subject}. 
Message: {message}') - _send_message(subject, message, collection) - - -def _send_message(subject, message, collection): - collection_emails = lmc_models.CollectionEmail.objects.filter(collection__acron3=collection, active=True).values_list('email', flat=True) - if len(collection_emails) == 0: - raise exceptions.UndefinedCollectionConfigError(_("Error. Please, add an E-mail Configuration for the collection.")) - - send_mail( - subject=subject, - message=message, - from_email=settings.EMAIL_HOST_USER, - recipient_list=collection_emails - ) + logging.info("Starting Daily Log Ingestion Pipeline") + task_search_log_files.apply_async(kwargs={"trigger_validation": True}) diff --git a/log_manager/tests.py b/log_manager/tests.py index 7ce503c..51c1402 100644 --- a/log_manager/tests.py +++ b/log_manager/tests.py @@ -1,3 +1,58 @@ +from unittest.mock import patch + +from django.db import IntegrityError from django.test import TestCase -# Create your tests here. +from collection.models import Collection + +from . import choices, tasks +from .models import LogFile + + +class LogFileTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def test_create_or_update_creates_log_file(self): + log_file = LogFile.create_or_update( + collection=self.collection, + path="/tmp/new.log.gz", + stat_result={"size": 10}, + hash="1" * 32, + ) + + self.assertEqual(log_file.collection, self.collection) + self.assertEqual(log_file.path, "/tmp/new.log.gz") + self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_CREATED) + + def test_create_or_update_refetches_existing_log_after_integrity_error(self): + existing = LogFile.objects.create( + collection=self.collection, + path="/tmp/existing.log.gz", + stat_result={"size": 10}, + hash="1" * 32, + status=choices.LOG_FILE_STATUS_CREATED, + ) + + with patch.object(LogFile.objects, "get_or_create", side_effect=IntegrityError): + log_file = LogFile.create_or_update( + collection=self.collection, + path="/tmp/existing.log.gz", + stat_result={"size": 10}, + hash=existing.hash, + ) + + self.assertEqual(log_file.pk, existing.pk) + + +class ValidateLogFilesTaskTests(TestCase): + def test_validate_log_files_returns_for_empty_visible_date_range(self): + with patch("log_manager.tasks.task_validate_log_file.s") as mocked_signature: + result = tasks.task_validate_log_files.run( + collections=["books"], + from_date="2024-02-02", + until_date="2024-02-01", + ) + + self.assertIsNone(result) + mocked_signature.assert_not_called() diff --git a/log_manager/utils.py b/log_manager/utils.py index 4a2b00b..c7dd2db 100644 --- a/log_manager/utils.py +++ b/log_manager/utils.py @@ -1,42 +1,47 @@ +import gzip import hashlib +from collections import deque from scielo_log_validator import validator -def hash_file(path, num_lines=25): +def hash_file(path, num_lines=500): """ - Calculates the MD5 hash of a file using a combination of its first and last `num_lines` lines, - as well as its size. - + Calculates the MD5 hash of a file using a combination of its first and last + `num_lines` lines. + + For gzip-compressed files, the content is decompressed before hashing, + so that different compressions of the same data produce the same hash. + File size is intentionally NOT included because it varies between + compressions and between growing log files, causing false duplicates. + Args: path (str): The path to the file. - num_lines (int): The number of lines to consider from the beginning and end of the file. Default is 25. 
+ num_lines (int): The number of lines to consider from the beginning + and end of the file. Default is 500. Returns: The MD5 hash digest as a hexadecimal string. """ md5_hash = hashlib.md5() - with open(path, 'rb') as file: - # Read the first `num_lines` lines of the file + opener = gzip.open if _is_gzip(path) else open + + with opener(path, 'rb') as file: first_lines = b''.join([file.readline() for _ in range(num_lines)]) md5_hash.update(first_lines) - # Move the file pointer to the end of the file - file.seek(0, 2) + tail = deque(maxlen=num_lines) + for line in file: + tail.append(line) + md5_hash.update(b''.join(tail)) - # Get the size of the file - size = file.tell() - md5_hash.update(str(size).encode()) - - # Move the file pointer to the start of the file - file.seek(-size, 2) + return md5_hash.hexdigest() - # Read the last `num_lines` lines of the file - last_lines = file.readlines()[-num_lines:] - md5_hash.update(b''.join(last_lines)) - return md5_hash.hexdigest() +def _is_gzip(path): + with open(path, 'rb') as f: + return f.read(2) == b'\x1f\x8b' def validate_file(path, sample_size=0.1, buffer_size=2048, days_delta=5, apply_path_validation=True, apply_content_validation=True): return validator.pipeline_validate( diff --git a/log_manager/wagtail_hooks.py b/log_manager/wagtail_hooks.py index aeb6908..1548ad3 100644 --- a/log_manager/wagtail_hooks.py +++ b/log_manager/wagtail_hooks.py @@ -3,54 +3,10 @@ from wagtail.snippets.models import register_snippet from config.menu import get_menu_order +from log_manager_config.wagtail_hooks import LogManagerCollectionConfigSnippetViewSet +from metrics.wagtail_hooks import DailyMetricJobSnippetViewSet -from log_manager.models import ( - CollectionLogFileDateCount, - LogFile, - LogFileDate, -) - - -class LogFileDateViewSet(SnippetViewSet): - model = LogFileDate - menu_label = _("Log Files per Day") - icon = "folder" - menu_order = 300 - - list_display = ( - "date", - "log_file", - ) - list_filter = ( - "date", - "log_file__collection", - ) - search_fields = () - - -class CollectionLogFileDateCountViewSet(SnippetViewSet): - model = CollectionLogFileDateCount - menu_label = _("Expected and Found Log Files") - icon = "folder" - menu_order = 400 - - list_display = ( - "collection", - "date", - "found_log_files", - "expected_log_files", - "status", - "exported_files_count", - "is_usage_metric_computed", - ) - list_filter = ( - "collection", - "status", - "exported_files_count", - "is_usage_metric_computed", - "year", - "month" - ) +from log_manager.models import LogFile class LogFileSnippetViewSet(SnippetViewSet): @@ -60,16 +16,17 @@ class LogFileSnippetViewSet(SnippetViewSet): menu_order = 500 list_display = ( "path", - "stat_result", "collection", "status", + "date", "validation", "summary", "last_processed_line", + "parse_heartbeat_at", "hash" ) - list_filter = ("status", "collection") - search_fields = ("file",) + list_filter = ("status", "collection", "date") + search_fields = ("path", "hash", "collection__acron3", "collection__main_name") class LogSnippetViewSetGroup(SnippetViewSetGroup): @@ -78,9 +35,9 @@ class LogSnippetViewSetGroup(SnippetViewSetGroup): menu_icon = "folder-open-inverse" menu_order = get_menu_order("log_manager") items = ( - LogFileDateViewSet, - CollectionLogFileDateCountViewSet, + LogManagerCollectionConfigSnippetViewSet, LogFileSnippetViewSet, + DailyMetricJobSnippetViewSet, ) diff --git a/log_manager_config/exceptions.py b/log_manager_config/exceptions.py index ad7581a..0a6a6a9 100644 --- 
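With the rewritten `hash_file`, the digest depends only on the first and last `num_lines` of the (decompressed) content, so the same log data should hash identically whether or not it is gzip-compressed. A small demonstration of that property (file paths and contents below are invented):

```python
import gzip
from pathlib import Path

from log_manager.utils import hash_file

content = b"".join(f"line {i}\n".encode() for i in range(1000))

plain = Path("/tmp/sample.log")
plain.write_bytes(content)

compressed = Path("/tmp/sample.log.gz")
with gzip.open(compressed, "wb") as fh:
    fh.write(content)

# Same digest for both files: size and compression no longer influence the hash.
assert hash_file(str(plain)) == hash_file(str(compressed))
```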
a/log_manager_config/exceptions.py +++ b/log_manager_config/exceptions.py @@ -4,11 +4,5 @@ class UndefinedCollectionLogDirectoryError(Exception): class UndefinedCollectionEmailError(Exception): ... -class UndefinedCollectionFilesPerDayError(Exception): - ... - class UndefinedSupportedLogFile(Exception): ... - -class MultipleFilesPerDayForTheSameDateError(Exception): - ... diff --git a/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py new file mode 100644 index 0000000..5b6351c --- /dev/null +++ b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py @@ -0,0 +1,223 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:27 + +import django.db.models.deletion +import modelcluster.fields +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("collection", "0001_initial"), + ("log_manager_config", "0003_alter_collectionemail_options_and_more"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="LogManagerCollectionConfig", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, verbose_name="Creation date" + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, verbose_name="Last update date" + ), + ), + ( + "sample_size", + models.FloatField(default=0.1, verbose_name="Sample Size"), + ), + ( + "buffer_size", + models.IntegerField(default=2048, verbose_name="Buffer Size"), + ), + ( + "expected_logs_per_day", + models.IntegerField( + default=1, verbose_name="Expected Logs Per Day" + ), + ), + ], + options={ + "verbose_name": "Log Manager Collection Config", + "verbose_name_plural": "Log Manager Collection Configs", + }, + ), + migrations.RemoveField( + model_name="collectionlogfilesperday", + name="collection", + ), + migrations.RemoveField( + model_name="collectionlogfilesperday", + name="creator", + ), + migrations.RemoveField( + model_name="collectionlogfilesperday", + name="updated_by", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="collection", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="creator", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="directory", + ), + migrations.RemoveField( + model_name="collectionurltranslatorclass", + name="updated_by", + ), + migrations.RemoveField( + model_name="collectionvalidationparameters", + name="collection", + ), + migrations.RemoveField( + model_name="collectionvalidationparameters", + name="creator", + ), + migrations.RemoveField( + model_name="collectionvalidationparameters", + name="updated_by", + ), + migrations.RemoveField( + model_name="supportedlogfile", + name="creator", + ), + migrations.RemoveField( + model_name="supportedlogfile", + name="updated_by", + ), + migrations.RemoveConstraint( + model_name="collectionemail", + name="unique_collection_email", + ), + migrations.RemoveConstraint( + model_name="collectionlogdirectory", + name="unique_collection_path", + ), + migrations.RemoveField( + model_name="collectionemail", + name="collection", + ), + migrations.RemoveField( + model_name="collectionlogdirectory", + name="collection", + ), + migrations.AddField( + model_name="collectionemail", + name="sort_order", + 
field=models.IntegerField(blank=True, editable=False, null=True), + ), + migrations.AddField( + model_name="collectionlogdirectory", + name="sort_order", + field=models.IntegerField(blank=True, editable=False, null=True), + ), + migrations.AddField( + model_name="collectionlogdirectory", + name="translator_class", + field=models.CharField( + default="URLTranslatorClassicSite", verbose_name="URL Translator Class" + ), + ), + migrations.AddField( + model_name="logmanagercollectionconfig", + name="collection", + field=models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name="log_manager_config", + to="collection.collection", + verbose_name="Collection", + ), + ), + migrations.AddField( + model_name="logmanagercollectionconfig", + name="creator", + field=models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + migrations.AddField( + model_name="logmanagercollectionconfig", + name="updated_by", + field=models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + migrations.AddField( + model_name="collectionemail", + name="config", + field=modelcluster.fields.ParentalKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="emails", + to="log_manager_config.logmanagercollectionconfig", + ), + ), + migrations.AddField( + model_name="collectionlogdirectory", + name="config", + field=modelcluster.fields.ParentalKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="directories", + to="log_manager_config.logmanagercollectionconfig", + ), + ), + migrations.AddConstraint( + model_name="collectionemail", + constraint=models.UniqueConstraint( + fields=("config", "email"), name="unique_config_email" + ), + ), + migrations.AddConstraint( + model_name="collectionlogdirectory", + constraint=models.UniqueConstraint( + fields=("config", "path"), name="unique_config_path" + ), + ), + migrations.DeleteModel( + name="CollectionLogFilesPerDay", + ), + migrations.DeleteModel( + name="CollectionURLTranslatorClass", + ), + migrations.DeleteModel( + name="CollectionValidationParameters", + ), + migrations.DeleteModel( + name="SupportedLogFile", + ), + ] diff --git a/log_manager_config/models.py b/log_manager_config/models.py index 384368e..8cf3e34 100644 --- a/log_manager_config/models.py +++ b/log_manager_config/models.py @@ -4,38 +4,57 @@ from django.utils import timezone from django.utils.translation import gettext_lazy as _ +from modelcluster.models import ClusterableModel +from modelcluster.fields import ParentalKey +from wagtail.models import Orderable +from wagtail.admin.panels import FieldPanel, InlinePanel +from wagtailautocomplete.edit_handlers import AutocompletePanel + from collection.models import Collection from core.models import CommonControlField -from .exceptions import MultipleFilesPerDayForTheSameDateError, UndefinedCollectionFilesPerDayError -class CollectionLogDirectory(CommonControlField): - collection = models.ForeignKey( +class LogManagerCollectionConfig(ClusterableModel, CommonControlField): + collection = models.OneToOneField( Collection, verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, + on_delete=models.CASCADE, + related_name="log_manager_config" ) - path = 
models.CharField( - verbose_name=_('Path'), - max_length=255, - blank=False, + sample_size = models.FloatField( + verbose_name=_('Sample Size'), + blank=False, null=False, + default=0.1, ) - directory_name = models.CharField( - verbose_name=_('Directory Name'), - max_length=255, - blank=True, - null=True, + buffer_size = models.IntegerField( + verbose_name=_('Buffer Size'), + blank=False, + null=False, + default=2048, ) - active = models.BooleanField( - verbose_name=_('Active'), - default=True, + expected_logs_per_day = models.IntegerField( + verbose_name=_('Expected Logs Per Day'), + default=1, ) + panels = [ + AutocompletePanel("collection"), + FieldPanel("sample_size"), + FieldPanel("buffer_size"), + FieldPanel("expected_logs_per_day"), + InlinePanel("directories", label=_("Directories")), + InlinePanel("emails", label=_("Emails")), + ] + def __str__(self): - return f'{self.collection} - {self.path} - {self.directory_name}' - + return f'{self.collection.acron3} Config' + + class Meta: + verbose_name = _('Log Manager Collection Config') + verbose_name_plural = _('Log Manager Collection Configs') + @classmethod def load(cls, data, user): for item in data: @@ -45,13 +64,12 @@ def load(cls, data, user): logging.warning(f'Collection {item.get("acronym")} not found.') continue - logging.info(item) cls.create_or_update( user=user, collection=collection, - directory_name=item.get('directory_name'), - path=item.get('path'), - active=item.get('active', True), + sample_size=item.get('sample_size', 0.1), + buffer_size=item.get('buffer_size', 2048), + expected_logs_per_day=item.get('quantity', 1), ) @classmethod @@ -59,81 +77,66 @@ def create_or_update( cls, user, collection, - directory_name, - path, - active, + sample_size, + buffer_size, + expected_logs_per_day, ): - try: - obj = cls.objects.get(collection=collection, path=path) - except cls.DoesNotExist: - obj = cls() + obj, created = cls.objects.get_or_create(collection=collection) + if created: obj.creator = user obj.created = timezone.now() - obj.collection = collection obj.updated_by = user obj.updated = timezone.now() - obj.directory_name = directory_name - obj.path = path - obj.active = active - + obj.sample_size = sample_size + obj.buffer_size = buffer_size + obj.expected_logs_per_day = expected_logs_per_day obj.save() - logging.info(f'{collection.acron3} - {directory_name} - {path}') + logging.info(f'Config for {collection.acron3} updated.') return obj - class Meta: - verbose_name = _('Collection Log Directory') - verbose_name_plural = _('Collection Log Directories') - constraints = [ - models.UniqueConstraint(fields=['collection', 'path'], name='unique_collection_path') - ] -class CollectionLogFilesPerDay(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, +class CollectionLogDirectory(Orderable, CommonControlField): + config = ParentalKey( + 'LogManagerCollectionConfig', + related_name='directories', + on_delete=models.CASCADE, + null=True, + blank=True, ) - start_date = models.DateField( - verbose_name=_('Start Date'), - blank=False, + path = models.CharField( + verbose_name=_('Path'), + max_length=255, + blank=False, null=False, ) - end_date = models.DateField( - verbose_name=_('End Date'), + directory_name = models.CharField( + verbose_name=_('Directory Name'), + max_length=255, blank=True, null=True, ) - quantity = models.IntegerField( - verbose_name=_('Quantity'), - default=1, + active = models.BooleanField( + verbose_name=_('Active'), + default=True, 
+ ) + translator_class = models.CharField( + verbose_name=_('URL Translator Class'), + blank=False, + null=False, + default='URLTranslatorClassicSite', ) def __str__(self): - return f'{self.start_date} - {self.quantity}' + return f'{self.config.collection} - {self.path} - {self.directory_name}' - @classmethod - def get_number_of_expected_files_by_day(cls, collection, date): - files_by_day = cls.objects.filter( - models.Q(collection__acron3=collection) & - models.Q(start_date__lte=date) & - (models.Q(end_date__gte=date) | models.Q(end_date__isnull=True)) - ) - - if files_by_day.count() > 1: - raise MultipleFilesPerDayForTheSameDateError(_("ERROR. Please, set the field end_date for the collection {collection}.")) - - if files_by_day.count() == 0: - raise UndefinedCollectionFilesPerDayError(_("ERROR. Please, set the number of files per day for the collection {collection}.")) - - return int(files_by_day.get().quantity) - @classmethod def load(cls, data, user): for item in data: try: collection = Collection.objects.get(acron3=item.get('acronym')) + config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -141,52 +144,55 @@ def load(cls, data, user): logging.info(item) cls.create_or_update( user=user, - collection=collection, - start_date=item.get('start_date'), - quantity=item.get('quantity'), - end_date=item.get('end_date'), + config=config, + directory_name=item.get('directory_name'), + path=item.get('path'), + active=item.get('active', True), ) @classmethod def create_or_update( cls, user, - collection, - start_date, - quantity, - end_date, + config, + directory_name, + path, + active, ): try: - obj = cls.objects.get(collection=collection, start_date=start_date) + obj = cls.objects.get(config=config, path=path) except cls.DoesNotExist: obj = cls() obj.creator = user obj.created = timezone.now() - obj.collection = collection - + obj.config = config + obj.updated_by = user obj.updated = timezone.now() - obj.start_date = start_date - obj.quantity = quantity - obj.end_date = end_date - + obj.directory_name = directory_name + obj.path = path + obj.active = active + obj.save() - logging.info(f'{collection.acron3} - {start_date} - {quantity}') + logging.info(f'{config.collection.acron3} - {directory_name} - {path}') return obj class Meta: - verbose_name = _('Collection Log Files Per Day') - verbose_name_plural = _('Collection Log Files Per Day') + verbose_name = _('Collection Log Directory') + verbose_name_plural = _('Collection Log Directories') constraints = [ - models.UniqueConstraint(fields=['collection', 'start_date'], name='unique_collection_start_date') + models.UniqueConstraint(fields=['config', 'path'], name='unique_config_path') ] -class CollectionEmail(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, + +class CollectionEmail(Orderable, CommonControlField): + config = ParentalKey( + 'LogManagerCollectionConfig', + related_name='emails', + on_delete=models.CASCADE, + null=True, + blank=True, ) name = models.CharField( verbose_name=_('Name'), @@ -218,6 +224,7 @@ def load(cls, data, user): for item in data: try: collection = Collection.objects.get(acron3=item.get('acronym')) + config, _ = LogManagerCollectionConfig.objects.get_or_create(collection=collection) except Collection.DoesNotExist: logging.warning(f'Collection {item.get("acronym")} not found.') continue @@ -225,7 
+232,7 @@ def load(cls, data, user): logging.info(item) cls.create_or_update( user=user, - collection=collection, + config=config, email=item.get('e-mail'), name=item.get('name'), position=item.get('position'), @@ -236,19 +243,19 @@ def load(cls, data, user): def create_or_update( cls, user, - collection, + config, email, name, position, active, ): try: - obj = cls.objects.get(collection=collection, email=email) + obj = cls.objects.get(config=config, email=email) except cls.DoesNotExist: obj = cls() obj.creator = user obj.created = timezone.now() - obj.collection = collection + obj.config = config obj.email = email obj.updated_by = user @@ -258,213 +265,14 @@ def create_or_update( obj.active = active obj.save() - logging.info(f'{collection.acron3} - {name} - {position} - {email}') + logging.info(f'{config.collection.acron3} - {name} - {position} - {email}') return obj class Meta: verbose_name = _('Collection Email') verbose_name_plural = _('Collection Emails') constraints = [ - models.UniqueConstraint(fields=['collection', 'email'], name='unique_collection_email') - ] - - -class CollectionValidationParameters(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, - primary_key=True, - ) - sample_size = models.FloatField( - verbose_name=_('Sample Size'), - blank=False, - null=False, - default=0.1, - ) - buffer_size = models.IntegerField( - verbose_name=_('Buffer Size'), - blank=False, - null=False, - default=2048, - ) - - def __str__(self): - return f'{self.collection.acron3} - {self.sample_size} - {self.buffer_size}' - - @classmethod - def load(cls, data, user): - for item in data: - try: - collection = Collection.objects.get(acron3=item.get('acronym')) - except Collection.DoesNotExist: - logging.warning(f'Collection {item.get("acronym")} not found.') - continue - - logging.info(item) - cls.create_or_update( - user=user, - collection=collection, - sample_size=item.get('sample_size'), - buffer_size=item.get('buffer_size'), - ) - - @classmethod - def create_or_update( - cls, - user, - collection, - sample_size, - buffer_size, - ): - try: - obj = cls.objects.get(collection=collection) - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - obj.collection = collection - - obj.updated_by = user - obj.updated = timezone.now() - obj.sample_size = sample_size - obj.buffer_size = buffer_size - - obj.save() - logging.info(f'{collection.acron3} - {sample_size} - {buffer_size}') - return obj - - class Meta: - verbose_name = _('Collection Validation Parameters') - verbose_name_plural = _('Collection Validation Parameters') - - -class CollectionURLTranslatorClass(CommonControlField): - collection = models.ForeignKey( - Collection, - verbose_name=_('Collection'), - on_delete=models.DO_NOTHING, - ) - directory = models.ForeignKey( - CollectionLogDirectory, - verbose_name=_('Directory'), - on_delete=models.DO_NOTHING, - ) - translator_class = models.CharField( - verbose_name=_('URL Translator Class'), - blank=False, - null=False, - default='URLTranslatorClassicSite', - ) - - def __str__(self): - return f'{self.collection.acron3} - {self.directory} - {self.translator_class}' - - class Meta: - verbose_name = _('Collection URL Translator Class') - verbose_name_plural = _('Collection URL Translator Classes') - constraints = [ - models.UniqueConstraint(fields=['collection', 'directory'], name='unique_collection_directory') + models.UniqueConstraint(fields=['config', 'email'], 
name='unique_config_email') ] - @classmethod - def load(cls, data, user): - for item in data: - try: - collection = Collection.objects.get(acron3=item.get('acronym')) - except Collection.DoesNotExist: - logging.warning(f'Collection {item.get("acronym")} not found.') - continue - - try: - directory = CollectionLogDirectory.objects.get(collection=collection, path=item.get('path')) - logging.info(item) - cls.create_or_update( - user=user, - collection=collection, - directory=directory, - translator_class=item.get('translator_class'), - ) - except CollectionLogDirectory.DoesNotExist: - logging.warning(f'Directory {item.get("path")} not found.') - continue - @classmethod - def create_or_update( - cls, - user, - collection, - directory, - translator_class, - ): - try: - obj = cls.objects.get(collection=collection) - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - obj.collection = collection - obj.directory = directory - - obj.updated_by = user - obj.updated = timezone.now() - obj.translator_class = translator_class - - obj.save() - logging.info(f'{collection.acron3} - {directory.path} - {translator_class}') - return obj - - -class SupportedLogFile(CommonControlField): - file_extension = models.CharField( - verbose_name=_('File Extension'), - max_length=255, - unique=True, - blank=False, - null=False, - ) - description = models.TextField( - verbose_name=_('Description'), - blank=True, - null=True, - ) - - def __str__(self): - return f'{self.file_extension}' - - @classmethod - def load(cls, data, user): - for item in data: - logging.info(item) - cls.create_or_update( - user=user, - file_extension=item.get('file_extension'), - description=item.get('description'), - ) - - @classmethod - def create_or_update( - cls, - user, - file_extension, - description, - ): - try: - obj = cls.objects.get(file_extension=file_extension) - except cls.DoesNotExist: - obj = cls() - obj.creator = user - obj.created = timezone.now() - - obj.updated_by = user - obj.updated = timezone.now() - obj.file_extension = file_extension - obj.description = description - - obj.save() - logging.info(f'{file_extension}') - return obj - - class Meta: - verbose_name = _('Supported Log File') - verbose_name_plural = _('Supported Log Files') diff --git a/log_manager_config/tasks.py b/log_manager_config/tasks.py index f15262b..c4ff399 100644 --- a/log_manager_config/tasks.py +++ b/log_manager_config/tasks.py @@ -1,59 +1,25 @@ -from django.contrib.auth import get_user_model +from django.conf import settings from django.utils.translation import gettext as _ -from core.utils.utils import _get_user from config import celery_app +from config.collections import COLLECTION_SIZE_SAMPLE_MAP, LOG_MANAGER_SEED_DATA +from core.utils.request_utils import _get_user from . 
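After this refactor there is one `LogManagerCollectionConfig` per collection, and directories and e-mails hang off it as `ParentalKey` children. A sketch of wiring one up programmatically, assuming a Django shell with migrations applied and an existing user and collection (the acronym, path and e-mail come from the old seed list further down in this diff; the rest are placeholders):

```python
from django.contrib.auth import get_user_model

from collection.models import Collection
from log_manager_config.models import (
    CollectionEmail,
    CollectionLogDirectory,
    LogManagerCollectionConfig,
)

user = get_user_model().objects.first()            # any existing user
collection = Collection.objects.get(acron3="scl")  # assumes the collection exists

config = LogManagerCollectionConfig.create_or_update(
    user=user,
    collection=collection,
    sample_size=0.1,
    buffer_size=2048,
    expected_logs_per_day=2,
)
CollectionLogDirectory.create_or_update(
    user=user,
    config=config,
    directory_name="Site novo",
    path="/app/logs/bkp-ratchet/scielo.nbr",
    active=True,
)
CollectionEmail.create_or_update(
    user=user,
    config=config,
    email="tecnologia@scielo.org",
    name="SciELO Tecnologia",
    position="",
    active=True,
)
```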
import models -User = get_user_model() - - -@celery_app.task(bind=True, name=_('Load log manager collection settings')) -def task_load_log_manager_collection_settings(self, data={}, user_id=None, username=None): +@celery_app.task(bind=True, name=_('[Log Pipeline] Load Log Manager Settings (Seed)')) +def task_load_log_manager_collection_settings(self, data=None, user_id=None, username=None): user = _get_user(self.request, username=username, user_id=user_id) if not data: - data = [ - {'acronym': 'arg', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ar', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'bol', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.bo', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'chl', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.cl', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'col', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.co', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'cri', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.cr', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'cub', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.cu', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'data', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-dataverse', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'dataverse'}, - {'acronym': 'ecu', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ec', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'esp', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.es', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'mex', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.mx', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'per', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.pe', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'preprints', 'directory_name': _('Site clássico') , 'path': '/app/logs/submission-node01', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'preprints'}, - {'acronym': 'prt', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.pt', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'pry', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.py', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'psi', 'directory_name': _('Site clássico') , 'path': 
'/app/logs/bkp-ratchet/scielo.pepsic', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'rve', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.revenf', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'rvt', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.revtur', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'scl', 'directory_name': _('Site novo') , 'path': '/app/logs/bkp-ratchet/scielo.nbr', 'quantity': 2, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'opac'}, - {'acronym': 'spa', 'directory_name': _('Site novo - versão prévia') , 'path': '/app/logs/bkp-ratchet/scielo.sp', 'quantity': 2, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'opac_alpha'}, - {'acronym': 'sss', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ss', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'sza', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.za', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'ury', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.uy', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'ven', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ve', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - {'acronym': 'wid', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.wi', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'}, - ] + data = LOG_MANAGER_SEED_DATA for i in data: - i['sample_size'] = 0.1 if i['acronym'] not in ['data', 'preprints', 'wid'] else 1.0 + size = getattr(settings, 'COLLECTION_ACRON3_SIZE_MAP', {}).get(i['acronym'], 'small') + i['sample_size'] = COLLECTION_SIZE_SAMPLE_MAP.get(size, 1.0) i['buffer_size'] = 2048 - data_extensions = [ - {'file_extension': '.log', 'description': ''}, - {'file_extension': '.gz', 'description': ''} - ] - + models.LogManagerCollectionConfig.load(data, user) models.CollectionLogDirectory.load(data, user) models.CollectionEmail.load(data, user) - models.CollectionLogFilesPerDay.load(data, user) - models.CollectionValidationParameters.load(data, user) - models.CollectionURLTranslatorClass.load(data, user) - models.SupportedLogFile.load(data_extensions, user) \ No newline at end of file diff --git a/log_manager_config/wagtail_hooks.py b/log_manager_config/wagtail_hooks.py index 2ecf908..f91c0b1 100644 --- a/log_manager_config/wagtail_hooks.py +++ b/log_manager_config/wagtail_hooks.py @@ -1,134 +1,24 @@ from django.utils.translation import gettext_lazy as _ -from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup -from wagtail.snippets.models import register_snippet +from wagtail.snippets.views.snippets import SnippetViewSet -from config.menu import get_menu_order +from log_manager_config.models import LogManagerCollectionConfig -from log_manager_config.models import ( - CollectionLogDirectory, - 
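The hard-coded per-acronym sample sizes above are replaced by a two-step lookup: `COLLECTION_ACRON3_SIZE_MAP` (read from settings) buckets each collection by size, and `COLLECTION_SIZE_SAMPLE_MAP` (from `config.collections`) maps the bucket to a sampling ratio. Neither map is part of this diff, so the values below are assumptions; something along these lines would reproduce the old behaviour of fully sampling only `data`, `preprints` and `wid`:

```python
# Illustrative only: the real maps live in config/collections.py and settings,
# which are not shown in this diff.
COLLECTION_SIZE_SAMPLE_MAP = {
    "small": 1.0,   # assumed: small collections are fully sampled
    "large": 0.1,   # assumed: large collections keep the old 10% sample
}

COLLECTION_ACRON3_SIZE_MAP = {
    "data": "small",
    "preprints": "small",
    "wid": "small",
    # remaining acronyms would map to "large" in this sketch
}


def resolve_sample_size(acronym: str) -> float:
    # Mirrors the lookup in task_load_log_manager_collection_settings.
    size = COLLECTION_ACRON3_SIZE_MAP.get(acronym, "small")
    return COLLECTION_SIZE_SAMPLE_MAP.get(size, 1.0)
```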
CollectionLogFilesPerDay, - CollectionEmail, - CollectionValidationParameters, - CollectionURLTranslatorClass, - SupportedLogFile, -) - - -class CollectionLogDirectorySnippetViewSet(SnippetViewSet): - model = CollectionLogDirectory - menu_label = _("Collection Log Directory") - icon = "folder" +class LogManagerCollectionConfigSnippetViewSet(SnippetViewSet): + model = LogManagerCollectionConfig + menu_label = _("Log Manager Configurations") + icon = "cogs" menu_order = 300 - list_display = ( - "collection", - "directory_name", - "path", - "active", - ) - list_filter = ( - "collection", - "active", - ) - search_fields = ( - "path", - ) - - -class CollectionLogFilesPerDaySnippetViewSet(SnippetViewSet): - model = CollectionLogFilesPerDay - menu_label = _("Collection Log Files Per Day") - icon = "folder" - menu_order = 400 - - list_display = ( - "collection", - "start_date", - "end_date", - "quantity", - ) - list_filter = ( - "collection", - ) - - -class CollectionEmailSnippetViewSet(SnippetViewSet): - model = CollectionEmail - menu_label = _("Collection Email") - icon = "folder" - menu_order = 500 - - list_display = ( - "collection", - "name", - "position", - "email", - "active", - ) - list_filter = ( - "collection", - "active", - ) - search_fields = ( - "name", - "email" - ) - -class CollectionValidationParametersSnippetViewSet(SnippetViewSet): - model = CollectionValidationParameters - menu_label = _("Collection Validation Parameters") - icon = "folder" - menu_order = 550 - list_display = ( "collection", "sample_size", "buffer_size", + "expected_logs_per_day", + "updated", ) list_filter = ( "collection", ) - -class CollectionURLTranslatorClassSnippetViewSet(SnippetViewSet): - model = CollectionURLTranslatorClass - menu_label = _("Collection URL Translator Class") - icon = "folder" - menu_order = 600 - - list_display = ( - "collection", - "directory", - "translator_class", - ) - list_filter = ( - "collection", - ) - -class SupportedLogFileSnippetViewSet(SnippetViewSet): - model = SupportedLogFile - menu_label = _("Supported Log File Formats") - icon = "folder" - menu_order = 600 - - list_display = ( - "file_extension", - "description", - ) - - -class LogManagerConfigSnippetViewSetGroup(SnippetViewSetGroup): - menu_name = 'log_manager_config' - menu_label = _("Log Manager Config") - menu_icon = "folder-open-inverse" - menu_order = get_menu_order("log_manager_config") - items = ( - CollectionLogDirectorySnippetViewSet, - CollectionLogFilesPerDaySnippetViewSet, - CollectionEmailSnippetViewSet, - CollectionValidationParametersSnippetViewSet, - CollectionURLTranslatorClassSnippetViewSet, - SupportedLogFileSnippetViewSet, + search_fields = ( + "collection__acron3", ) - - -register_snippet(LogManagerConfigSnippetViewSetGroup) diff --git a/merge_production_dotenvs_in_dotenv.py b/merge_production_dotenvs_in_dotenv.py deleted file mode 100644 index d1170ef..0000000 --- a/merge_production_dotenvs_in_dotenv.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -from pathlib import Path -from typing import Sequence - -import pytest - -ROOT_DIR_PATH = Path(__file__).parent.resolve() -PRODUCTION_DOTENVS_DIR_PATH = ROOT_DIR_PATH / ".envs" / ".production" -PRODUCTION_DOTENV_FILE_PATHS = [ - PRODUCTION_DOTENVS_DIR_PATH / ".django", - PRODUCTION_DOTENVS_DIR_PATH / ".postgres", -] -DOTENV_FILE_PATH = ROOT_DIR_PATH / ".env" - - -def merge( - output_file_path: str, merged_file_paths: Sequence[str], append_linesep: bool = True -) -> None: - with open(output_file_path, "w") as output_file: - for merged_file_path in 
merged_file_paths: - with open(merged_file_path, "r") as merged_file: - merged_file_content = merged_file.read() - output_file.write(merged_file_content) - if append_linesep: - output_file.write(os.linesep) - - -def main(): - merge(DOTENV_FILE_PATH, PRODUCTION_DOTENV_FILE_PATHS) - - -@pytest.mark.parametrize("merged_file_count", range(3)) -@pytest.mark.parametrize("append_linesep", [True, False]) -def test_merge(tmpdir_factory, merged_file_count: int, append_linesep: bool): - tmp_dir_path = Path(str(tmpdir_factory.getbasetemp())) - - output_file_path = tmp_dir_path / ".env" - - expected_output_file_content = "" - merged_file_paths = [] - for i in range(merged_file_count): - merged_file_ord = i + 1 - - merged_filename = ".service{}".format(merged_file_ord) - merged_file_path = tmp_dir_path / merged_filename - - merged_file_content = merged_filename * merged_file_ord - - with open(merged_file_path, "w+") as file: - file.write(merged_file_content) - - expected_output_file_content += merged_file_content - if append_linesep: - expected_output_file_content += os.linesep - - merged_file_paths.append(merged_file_path) - - merge(output_file_path, merged_file_paths, append_linesep) - - with open(output_file_path, "r") as output_file: - actual_output_file_content = output_file.read() - - assert actual_output_file_content == expected_output_file_content - - -if __name__ == "__main__": - main() diff --git a/metrics/counter/__init__.py b/metrics/counter/__init__.py new file mode 100644 index 0000000..c9afd92 --- /dev/null +++ b/metrics/counter/__init__.py @@ -0,0 +1,22 @@ +from .access import ( + extract_item_access_data, + is_valid_item_access_data, + update_results_with_item_access_data, +) +from .documents import convert_raw_results_to_index_documents +from .identifiers import ( + generate_item_access_id, + generate_month_document_id, + generate_user_session_id, + generate_year_document_id, +) +from .parser import ( + extract_date_from_validation_dict, + translator_class_name_to_obj, +) +from metrics.opensearch.names import ( + extract_access_month, + extract_access_year, + generate_month_index_name, + generate_year_index_name, +) diff --git a/metrics/counter/access.py b/metrics/counter/access.py new file mode 100644 index 0000000..12c0cc5 --- /dev/null +++ b/metrics/counter/access.py @@ -0,0 +1,438 @@ +import re +from urllib.parse import unquote, urlparse + +from scielo_usage_counter.values import ( + CONTENT_TYPE_UNDEFINED, + DEFAULT_SCIELO_ISSN, + MEDIA_LANGUAGE_UNDEFINED, + MEDIA_FORMAT_UNDEFINED, +) + +from core.utils.standardizer import ( + standardize_language_code, + standardize_pid_generic, + standardize_pid_v2, + standardize_pid_v3, + standardize_year_of_publication, +) +from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour +from metrics.counter.identifiers import generate_item_access_id, generate_user_session_id + + +def extract_item_access_data(collection_acron3: str, translated_url: dict): + if not translated_url or not isinstance(translated_url, dict): + return {} + + source_type = _extract_source_type(collection_acron3, translated_url) + source_id = _extract_source_id(collection_acron3, translated_url, source_type) + scielo_issn = _extract_scielo_issn(translated_url, source_type, source_id) + document_type = _extract_document_type(collection_acron3, translated_url, source_type) + publication_year = _safe_standardize( + standardize_year_of_publication, + translated_url.get("year_of_publication"), + ) + source_access_type = 
translated_url.get("source_access_type") + + return { + "collection": collection_acron3, + "source_type": source_type, + "source_id": source_id, + "scielo_issn": scielo_issn, + "document_type": document_type, + "pid_v2": _safe_standardize(standardize_pid_v2, translated_url.get("pid_v2")), + "pid_v3": _safe_standardize(standardize_pid_v3, translated_url.get("pid_v3")), + "pid_generic": _safe_standardize( + standardize_pid_generic, + translated_url.get("pid_generic"), + ), + "title_pid_generic": _safe_standardize( + standardize_pid_generic, + translated_url.get("title_pid_generic"), + ), + "segment_pid_generics": _standardize_pid_generic_list( + translated_url.get("segment_pid_generics"), + ), + "media_language": _safe_standardize( + standardize_language_code, + translated_url.get("media_language"), + default="un", + ), + "media_format": translated_url.get("media_format"), + "content_type": translated_url.get("content_type"), + "access_url": translated_url.get("access_url") or translated_url.get("normalized_url"), + "publication_year": publication_year, + "counter_access_type": _counter_access_type(source_access_type), + "access_method": "Regular", + "source_main_title": _extract_source_title(translated_url), + "source_subject_area_capes": translated_url.get("source_subject_area_capes") + or translated_url.get("journal_subject_area_capes"), + "source_subject_area_wos": translated_url.get("source_subject_area_wos") + or translated_url.get("journal_subject_area_wos"), + "source_acronym": translated_url.get("source_acronym") + or translated_url.get("journal_acronym"), + "source_publisher_name": translated_url.get("source_publisher_name") + or translated_url.get("journal_publisher_name"), + "source_access_type": source_access_type, + "source_identifiers": _extract_source_identifiers(translated_url, source_id, source_type), + "source_city": translated_url.get("source_city"), + "source_country": translated_url.get("source_country"), + } + + +def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False): + if not isinstance(data, dict): + return False, {"message": "Invalid data format. 
Expected a dictionary.", "code": "invalid_format"} + + scielo_issn = data.get("scielo_issn") + source_id = data.get("source_id") + source_type = data.get("source_type") + document_type = data.get("document_type") or "article" + media_format = data.get("media_format") + media_language = data.get("media_language") + content_type = data.get("content_type") + pid_v2 = data.get("pid_v2") + pid_v3 = data.get("pid_v3") + pid_generic = data.get("pid_generic") + has_source_identity = bool(source_id) or bool( + scielo_issn and scielo_issn != DEFAULT_SCIELO_ISSN + ) + has_media_language = bool(media_language and media_language != MEDIA_LANGUAGE_UNDEFINED) + has_pid = bool(pid_v2 or pid_v3 or pid_generic) + + if not all([media_format and media_format != MEDIA_FORMAT_UNDEFINED, content_type and content_type != CONTENT_TYPE_UNDEFINED, has_pid]): + return False, {"message": "Missing required fields in item access data.", "code": "missing_fields"} + + if document_type in {"article", "book", "chapter"} and not has_media_language: + return False, {"message": "Missing media language in item access data.", "code": "missing_fields"} + + if document_type == "article" and not has_source_identity: + return False, {"message": "Missing article source identity.", "code": "missing_fields"} + + if document_type in {"book", "chapter"} and not source_id: + return False, {"message": "Missing book source identity.", "code": "missing_fields"} + + if document_type in {"preprint", "dataset"} and not pid_generic: + return False, {"message": "Missing generic PID in item access data.", "code": "missing_fields"} + + if utm and not ignore_utm_validation: + if ( + source_type == "journal" + and scielo_issn + and scielo_issn != DEFAULT_SCIELO_ISSN + and not utm.is_valid_code(scielo_issn, utm.sources_metadata["issn_set"]) + ): + return False, {"message": f"Invalid scielo_issn: {scielo_issn}", "code": "invalid_scielo_issn"} + + if ( + source_type + and source_type != "journal" + and source_id + and source_id not in utm.sources_metadata.get("source_id_to_type", {}) + ): + return False, {"message": f"Invalid source_id: {source_id}", "code": "invalid_source_id"} + + if pid_v2 and not utm.is_valid_code(pid_v2, utm.documents_metadata["pid_set"]): + return False, {"message": f"Invalid pid_v2: {pid_v2}", "code": "invalid_pid_v2"} + + if pid_v3 and not utm.is_valid_code(pid_v3, utm.documents_metadata["pid_set"]): + return False, {"message": f"Invalid pid_v3: {pid_v3}", "code": "invalid_pid_v3"} + + if pid_generic and not utm.is_valid_code(pid_generic, utm.documents_metadata["pid_set"]): + return False, {"message": f"Invalid pid_generic: {pid_generic}", "code": "invalid_pid_generic"} + + return True, {"message": "Item access data is valid.", "code": "valid"} + + +def update_results_with_item_access_data(results: dict, item_access_data: dict, line: dict): + col_acron3 = item_access_data.get("collection") + source_key = ( + item_access_data.get("source_id") + or item_access_data.get("scielo_issn") + or item_access_data.get("source_type") + or col_acron3 + ) + pid_v2 = item_access_data.get("pid_v2") + pid_v3 = item_access_data.get("pid_v3") + media_format = item_access_data.get("media_format") + content_language = item_access_data.get("media_language") + content_type = item_access_data.get("content_type") + access_url = item_access_data.get("access_url") or _normalize_access_url(line.get("url")) + + client_name = line.get("client_name") + client_version = line.get("client_version") + local_datetime = line.get("local_datetime") + 
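`is_valid_item_access_data` enforces different minimums per `document_type`: every record needs a defined `media_format`, `content_type` and at least one PID; articles additionally need a media language and a source identity; books and chapters need a `source_id`; preprints and datasets need a `pid_generic`. A quick illustration with invented field values, skipping the UTM code-list checks by passing no `utm` object:

```python
from metrics.counter.access import is_valid_item_access_data

ok, detail = is_valid_item_access_data({
    "document_type": "article",
    "source_id": "0001-3765",          # placeholder journal identifier
    "scielo_issn": "0001-3765",
    "pid_v3": "ABCDEFGHIJKLMNOPQRSTUVW",
    "media_format": "html",
    "media_language": "pt",
    "content_type": "fulltext",
})
print(ok, detail["code"])   # expected: True valid

ok, detail = is_valid_item_access_data({
    "document_type": "article",
    "pid_v3": "ABCDEFGHIJKLMNOPQRSTUVW",
    "media_language": "pt",
    "content_type": "fulltext",
    # no media_format, so the record is rejected
})
print(ok, detail["code"])   # expected: False missing_fields
```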
access_country_code = line.get("country_code") + ip_address = line.get("ip_address") + + truncated_datetime = truncate_datetime_to_hour(local_datetime) + ms_key = extract_minute_second_key(local_datetime) + if truncated_datetime is None or ms_key is None: + raise ValueError("Invalid local_datetime in parsed log line.") + + access_date = truncated_datetime.strftime("%Y-%m-%d") + access_year = access_date[:4] + access_month = access_date[:7].replace("-", "") + + user_session_id = generate_user_session_id( + client_name, + client_version, + ip_address, + truncated_datetime, + ) + + for access_target in _iter_access_targets(item_access_data): + item_access_id = generate_item_access_id( + user_session_id=user_session_id, + col_acron3=col_acron3, + source_key=source_key, + pid_v2=pid_v2, + pid_v3=pid_v3, + pid_generic=access_target.get("pid_generic"), + content_language=content_language, + access_country_code=access_country_code, + media_format=media_format, + content_type=content_type, + ) + + if item_access_id not in results: + results[item_access_id] = { + "collection": col_acron3, + "source_key": source_key, + "document_type": access_target.get("document_type"), + "pid_v2": pid_v2, + "pid_v3": pid_v3, + "pid_generic": access_target.get("pid_generic"), + "title_pid_generic": ( + item_access_data.get("title_pid_generic") + or access_target.get("pid_generic") + ), + "user_session_id": user_session_id, + "click_timestamps": {ms_key: 0}, + "click_timestamps_by_url": {}, + "access_url": access_url, + "media_format": media_format, + "content_language": content_language, + "content_type": content_type, + "access_country_code": access_country_code, + "access_date": access_date, + "access_year": access_year, + "access_month": access_month, + "publication_year": item_access_data.get("publication_year"), + "counter_access_type": item_access_data.get("counter_access_type") or "Open", + "access_method": item_access_data.get("access_method") or "Regular", + "source": { + "source_type": item_access_data.get("source_type"), + "source_id": item_access_data.get("source_id"), + "scielo_issn": item_access_data.get("scielo_issn"), + "main_title": item_access_data.get("source_main_title"), + "identifiers": item_access_data.get("source_identifiers"), + "access_type": item_access_data.get("source_access_type"), + "city": item_access_data.get("source_city"), + "country": item_access_data.get("source_country"), + "subject_area_capes": item_access_data.get("source_subject_area_capes"), + "subject_area_wos": item_access_data.get("source_subject_area_wos"), + "acronym": item_access_data.get("source_acronym"), + "publisher_name": item_access_data.get("source_publisher_name"), + }, + } + + if ms_key not in results[item_access_id]["click_timestamps"]: + results[item_access_id]["click_timestamps"][ms_key] = 0 + + results[item_access_id]["click_timestamps"][ms_key] += 1 + + access_url_key = access_url or _fallback_access_url_key( + access_target.get("pid_generic"), + media_format, + content_type, + ) + timestamps_by_url = results[item_access_id].setdefault("click_timestamps_by_url", {}) + url_timestamps = timestamps_by_url.setdefault(access_url_key, {}) + if ms_key not in url_timestamps: + url_timestamps[ms_key] = 0 + url_timestamps[ms_key] += 1 + + +def _extract_source_type(collection_acron3, translated_url): + source_type = translated_url.get("source_type") + if source_type: + return source_type + + if collection_acron3 == "preprints": + return "preprint_server" + + if collection_acron3 == "data": + return "data_repository" 
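`update_results_with_item_access_data` folds each parsed log line into an accumulator keyed by item access id, so repeated clicks from the same session on the same item merge into a single entry whose `click_timestamps` count hits per minute-second bucket (and per URL). Roughly, one accumulated entry looks like this (abridged, with placeholder values; the minute-second key format comes from `core.utils.date_utils`, which is not shown here):

```python
results = {
    "<item access id>": {
        "collection": "scl",
        "source_key": "0001-3765",
        "pid_v3": "ABCDEFGHIJKLMNOPQRSTUVW",
        "user_session_id": "<derived from client, IP and truncated hour>",
        "access_date": "2024-02-01",
        "access_year": "2024",
        "access_month": "202402",
        "click_timestamps": {"<minute-second>": 2},
        "click_timestamps_by_url": {
            "/scielo.php": {"<minute-second>": 2},
        },
        "content_type": "fulltext",
        "counter_access_type": "Open",
        "access_method": "Regular",
        "source": {
            "source_type": "journal",
            "scielo_issn": "0001-3765",
            "main_title": "<journal title>",
        },
    },
}
```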
+ + if collection_acron3 == "books": + return "book" + + if translated_url.get("book_id"): + return "book" + + if ( + translated_url.get("scielo_issn") + and translated_url.get("scielo_issn") != DEFAULT_SCIELO_ISSN + ): + return "journal" + + if translated_url.get("journal_acronym") or translated_url.get("journal_main_title"): + return "journal" + + return "other" + + +def _extract_source_id(collection_acron3, translated_url, source_type): + source_id = translated_url.get("source_id") + if source_id: + return source_id + + if source_type == "preprint_server": + return translated_url.get("preprint_server_id") or "scielo-preprints" + + if source_type == "data_repository": + return translated_url.get("repository_id") or "scielo-data" + + if source_type == "book": + return ( + translated_url.get("book_id") + or _extract_book_id_from_pid(translated_url.get("title_pid_generic")) + or _extract_book_id_from_pid(translated_url.get("pid_generic")) + ) + + if source_type == "journal": + return translated_url.get("scielo_issn") + + return None + + +def _extract_scielo_issn(translated_url, source_type, source_id): + scielo_issn = translated_url.get("scielo_issn") + if scielo_issn: + return scielo_issn + + if source_type == "journal" and source_id: + return source_id + + if source_type in {"book", "other"} or translated_url.get("book_id"): + return DEFAULT_SCIELO_ISSN + + return None + + +def _extract_source_title(translated_url): + return ( + translated_url.get("source_main_title") + or translated_url.get("journal_main_title") + or translated_url.get("book_title") + ) + + +def _extract_document_type(collection_acron3, translated_url, source_type): + document_type = translated_url.get("document_type") + if document_type: + return document_type + + if collection_acron3 == "preprints": + return "preprint" + + if collection_acron3 == "data": + return "dataset" + + if collection_acron3 == "books" or source_type == "book": + pid_generic = translated_url.get("pid_generic") or "" + if translated_url.get("chapter_id") or "/CHAPTER:" in pid_generic.upper(): + return "chapter" + if translated_url.get("book_id"): + return "book" + return "book" + + if source_type == "journal": + return "article" + + return "article" + + +def _extract_source_identifiers(translated_url, source_id, source_type): + identifiers = translated_url.get("source_identifiers") + if isinstance(identifiers, dict): + compact = {key: value for key, value in identifiers.items() if value not in (None, "", [], {}, ())} + if compact: + return compact + + if source_type != "book": + return None + + compact = { + "book_id": source_id or translated_url.get("book_id"), + "isbn": translated_url.get("isbn"), + "eisbn": translated_url.get("eisbn"), + "doi": translated_url.get("doi"), + } + compact = {key: value for key, value in compact.items() if value not in (None, "", [], {}, ())} + return compact or None + + +def _extract_book_id_from_pid(value): + if not value: + return None + normalized = str(value).upper() + if not normalized.startswith("BOOK:"): + return None + return normalized.split("BOOK:", 1)[1].split("/", 1)[0] or None + + +def _counter_access_type(source_access_type): + normalized = str(source_access_type or "").strip().lower() + if normalized == "commercial": + return "Controlled" + if normalized in {"free_to_read", "free-to-read", "free"}: + return "Free_To_Read" + return "Open" + + +def _safe_standardize(func, value, default=""): + try: + return func(value) + except Exception: + return default + + +def _standardize_pid_generic_list(values): 
+ if not isinstance(values, (list, tuple, set)): + return [] + items = [] + for value in values: + item = _safe_standardize(standardize_pid_generic, value) + if item and item not in items: + items.append(item) + return items + + +def _iter_access_targets(item_access_data): + return [ + { + "pid_generic": item_access_data.get("pid_generic"), + "document_type": item_access_data.get("document_type"), + } + ] + + +def _normalize_access_url(url): + if not url: + return None + parsed_url = urlparse(str(url).strip()) + path = parsed_url.path if parsed_url.scheme or parsed_url.netloc else str(url).strip() + path = unquote(path or "") + path = path.split("?", 1)[0].split("#", 1)[0].split()[0] + path = re.sub(r"/+", "/", path) + path = path.rstrip(".,;:") + return path or None + + +def _fallback_access_url_key(pid_generic, media_format, content_type): + return "|".join([ + str(pid_generic or ""), + str(media_format or ""), + str(content_type or ""), + ]) diff --git a/metrics/counter/aggregation.py b/metrics/counter/aggregation.py new file mode 100644 index 0000000..d047e7a --- /dev/null +++ b/metrics/counter/aggregation.py @@ -0,0 +1,124 @@ +from scielo_usage_counter.counter import get_valid_clicks, is_request + + +def apply_unique_metrics( + document, + unique_state, + scope, + document_id, + user_session_id, + is_request_event, +): + if not user_session_id: + return + + inv_bucket = unique_state[f"{scope}_investigations"] + inv_key = (document_id, user_session_id) + add_investigation = inv_key not in inv_bucket + if add_investigation: + inv_bucket.add(inv_key) + + add_request = False + if is_request_event: + req_bucket = unique_state[f"{scope}_requests"] + req_key = (document_id, user_session_id) + add_request = req_key not in req_bucket + if add_request: + req_bucket.add(req_key) + + increment_document_uniques( + document=document, + add_investigation=add_investigation, + add_request=add_request, + ) + + +def increment_document_totals(document, click_timestamps, content_type, click_timestamps_by_url=None): + number_of_clicks = _count_valid_clicks( + click_timestamps=click_timestamps, + click_timestamps_by_url=click_timestamps_by_url, + ) + + document["total_investigations"] += number_of_clicks + if is_request(content_type): + document["total_requests"] += number_of_clicks + + if "daily_metrics" in document: + day_key = list(document["daily_metrics"].keys())[0] + document["daily_metrics"][day_key]["total_investigations"] += number_of_clicks + if is_request(content_type): + document["daily_metrics"][day_key]["total_requests"] += number_of_clicks + + +def _count_valid_clicks(click_timestamps, click_timestamps_by_url=None): + if isinstance(click_timestamps_by_url, dict) and click_timestamps_by_url: + return sum( + get_valid_clicks(timestamps or {}) + for timestamps in click_timestamps_by_url.values() + ) + return get_valid_clicks(click_timestamps or {}) + + +def increment_document_uniques(document, add_investigation=False, add_request=False): + if add_investigation: + document["unique_investigations"] += 1 + if add_request: + document["unique_requests"] += 1 + + if "daily_metrics" in document: + day_key = list(document["daily_metrics"].keys())[0] + if add_investigation: + document["daily_metrics"][day_key]["unique_investigations"] += 1 + if add_request: + document["daily_metrics"][day_key]["unique_requests"] += 1 + + +def counter_data_type(document_type): + if document_type == "dataset": + return "Dataset" + if document_type in {"article", "preprint"}: + return "Article" + if document_type == "book": 
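`_normalize_access_url` reduces a raw request URL to a cleaned path: the query string and fragment are dropped, percent-encoding is decoded, repeated slashes collapse, and trailing punctuation is stripped. Two examples against the helper as written (the URLs are invented):

```python
from metrics.counter.access import _normalize_access_url

print(_normalize_access_url(
    "https://www.scielo.br//scielo.php?pid=S0001-37652024000100101&script=sci_arttext"
))
# -> /scielo.php

print(_normalize_access_url("/article//10.1590%2Fabc#top"))
# -> /article/10.1590/abc
```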
+ return "Book" + if document_type == "chapter": + return "Book_Segment" + return "Other" + + +def parent_data_type(document_type, source_type=None): + if document_type == "chapter": + return "Book" + if document_type == "article" and source_type == "journal": + return "Journal" + return None + + +def article_version(document_type): + if document_type == "preprint": + return "Preprint" + return None + + +def should_create_book_item_document(value): + if not value.get("pid_generic"): + return False + if value.get("document_type") == "book" and not is_request(value.get("content_type")): + return False + return True + + +def extract_title_pid_generic(value, fallback=None): + title_pid_generic = value.get("title_pid_generic") + if title_pid_generic: + return title_pid_generic + + pid_generic = value.get("pid_generic") + if "/CHAPTER:" in (pid_generic or "").upper(): + return pid_generic.upper().split("/CHAPTER:")[0] + + source = value.get("source") or {} + source_id = source.get("source_id") + if source_id: + return f"BOOK:{str(source_id).upper()}" + + return fallback diff --git a/metrics/counter/documents.py b/metrics/counter/documents.py new file mode 100644 index 0000000..63730ae --- /dev/null +++ b/metrics/counter/documents.py @@ -0,0 +1,322 @@ +from scielo_usage_counter.counter import is_request + +from metrics.counter.aggregation import ( + apply_unique_metrics, + article_version, + counter_data_type, + extract_title_pid_generic, + increment_document_totals, + parent_data_type, + should_create_book_item_document, +) +from metrics.counter.identifiers import generate_month_document_id, generate_year_document_id + + +def convert_to_month_index_documents(data: dict): + if not isinstance(data, dict): + return {} + + metrics_data = {} + unique_state = _initialize_unique_state() + + for value in data.values(): + _accumulate_documents( + data=metrics_data, + unique_state=unique_state, + value=value, + granularity="month", + ) + + return metrics_data + + +def convert_to_year_index_documents(data: dict): + if not isinstance(data, dict): + return {} + + metrics_data = {} + unique_state = _initialize_unique_state() + + for value in data.values(): + _accumulate_documents( + data=metrics_data, + unique_state=unique_state, + value=value, + granularity="year", + ) + + return metrics_data + + +def convert_raw_results_to_index_documents(data: dict): + return { + "month": convert_to_month_index_documents(data), + "year": convert_to_year_index_documents(data), + } + + +def _initialize_unique_state(): + return { + "item_investigations": set(), + "item_requests": set(), + "title_investigations": set(), + "title_requests": set(), + } + + +def _accumulate_documents(data, unique_state, value, granularity): + if not isinstance(value, dict): + return + + if value.get("collection") == "books": + _accumulate_books_documents(data, unique_state, value, granularity) + return + + _accumulate_standard_documents(data, unique_state, value, granularity) + + +def _accumulate_standard_documents(data, unique_state, value, granularity): + document_id = _generate_document_id(value, granularity) + document = data.setdefault( + document_id, + _build_base_document(value=value, granularity=granularity), + ) + + increment_document_totals( + document=document, + click_timestamps=value.get("click_timestamps"), + click_timestamps_by_url=value.get("click_timestamps_by_url"), + content_type=value.get("content_type"), + ) + apply_unique_metrics( + document=document, + unique_state=unique_state, + scope="item", + document_id=document_id, + 
user_session_id=value.get("user_session_id"), + is_request_event=is_request(value.get("content_type")), + ) + + +def _accumulate_books_documents(data, unique_state, value, granularity): + if should_create_book_item_document(value): + item_document_id = _generate_document_id( + value, + granularity, + metric_scope="item", + ) + item_document = data.setdefault( + item_document_id, + _build_base_document( + value=value, + granularity=granularity, + metric_scope="item", + ), + ) + increment_document_totals( + document=item_document, + click_timestamps=value.get("click_timestamps"), + click_timestamps_by_url=value.get("click_timestamps_by_url"), + content_type=value.get("content_type"), + ) + apply_unique_metrics( + document=item_document, + unique_state=unique_state, + scope="item", + document_id=item_document_id, + user_session_id=value.get("user_session_id"), + is_request_event=is_request(value.get("content_type")), + ) + + title_pid_generic = extract_title_pid_generic(value) + if not title_pid_generic: + return + + title_document_id = _generate_document_id( + value, + granularity, + metric_scope="title", + pid_generic=title_pid_generic, + ) + title_document = data.setdefault( + title_document_id, + _build_base_document( + value=value, + granularity=granularity, + metric_scope="title", + pid_generic=title_pid_generic, + document_type="book", + ), + ) + increment_document_totals( + document=title_document, + click_timestamps=value.get("click_timestamps"), + click_timestamps_by_url=value.get("click_timestamps_by_url"), + content_type=value.get("content_type"), + ) + apply_unique_metrics( + document=title_document, + unique_state=unique_state, + scope="title", + document_id=title_document_id, + user_session_id=value.get("user_session_id"), + is_request_event=is_request(value.get("content_type")), + ) + + +def _generate_document_id(value, granularity, metric_scope=None, pid_generic=None): + pid_generic = pid_generic or value.get("pid_generic") + publication_year = str(value.get("publication_year") or "0001") + if granularity == "month": + access_month = value.get("access_date", "")[:7] if value.get("access_date") else "" + return generate_month_document_id( + collection=value.get("collection"), + source_key=value.get("source_key"), + pid_v2=value.get("pid_v2"), + pid_v3=value.get("pid_v3"), + pid_generic=pid_generic, + access_month=access_month, + counter_access_type=value.get("counter_access_type") or "Open", + access_method=value.get("access_method") or "Regular", + publication_year=publication_year, + metric_scope="title" if metric_scope == "title" else None, + ) + + return generate_year_document_id( + collection=value.get("collection"), + source_key=value.get("source_key"), + pid_v2=value.get("pid_v2"), + pid_v3=value.get("pid_v3"), + pid_generic=pid_generic, + content_language=value.get("content_language"), + access_country_code=value.get("access_country_code"), + access_year=value.get("access_year"), + counter_access_type=value.get("counter_access_type") or "Open", + access_method=value.get("access_method") or "Regular", + publication_year=publication_year, + metric_scope="title" if metric_scope == "title" else None, + ) + + +def _build_base_document(value, granularity, metric_scope=None, pid_generic=None, document_type=None): + collection = value.get("collection") + if collection == "books": + normalized_pid_generic = pid_generic or value.get("pid_generic") + title_pid_generic = extract_title_pid_generic(value, fallback=normalized_pid_generic) + base_document = { + "collection": 
collection, + "source": _build_books_source(value.get("source")), + "document_type": document_type or value.get("document_type"), + "scielo_document_type": document_type or value.get("document_type"), + "metric_scope": metric_scope or "item", + "counter_data_type": "Book" if metric_scope == "title" else "Book_Segment", + "parent_data_type": "Book" if metric_scope != "title" else None, + "title_pid_generic": title_pid_generic, + "pid": normalized_pid_generic, + "pid_generic": normalized_pid_generic, + "publication_year": value.get("publication_year"), + "counter_access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + _apply_access_fields(base_document, value, granularity) + if granularity == "year": + base_document["content_language"] = value.get("content_language") + base_document["access_country_code"] = value.get("access_country_code") + return base_document + + base_document = { + "collection": collection, + "source": _build_standard_source(value.get("source")), + "document_type": value.get("document_type"), + "scielo_document_type": value.get("document_type"), + "metric_scope": "item", + "counter_data_type": counter_data_type(value.get("document_type")), + "parent_data_type": parent_data_type( + value.get("document_type"), + (value.get("source") or {}).get("source_type"), + ), + "article_version": article_version(value.get("document_type")), + "pid": value.get("pid_v3") or value.get("pid_v2") or value.get("pid_generic"), + "pid_v2": value.get("pid_v2"), + "pid_v3": value.get("pid_v3"), + "pid_generic": value.get("pid_generic"), + "publication_year": value.get("publication_year"), + "counter_access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + _apply_access_fields(base_document, value, granularity) + if granularity == "year": + base_document["content_language"] = value.get("content_language") + base_document["access_country_code"] = value.get("access_country_code") + return base_document + + +def _apply_access_fields(base_document, value, granularity): + if granularity == "month": + base_document["access_month"] = value.get("access_date", "")[:7] if value.get("access_date") else "" + day = value.get("access_date", "")[-2:] if value.get("access_date") else "01" + base_document["daily_metrics"] = { + day: { + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + } + return + + base_document["access_year"] = value.get("access_year") + + +def _build_books_source(source): + source = source or {} + identifiers = source.get("identifiers") or {} + compact_identifiers = { + key: value + for key, value in identifiers.items() + if key in {"book_id", "isbn", "eisbn", "doi"} and value not in (None, "", [], {}, ()) + } + + return { + "source_type": source.get("source_type"), + "source_id": source.get("source_id"), + "main_title": source.get("main_title"), + "access_type": source.get("access_type"), + "publisher": source.get("publisher_name"), + "city": source.get("city"), + "country": source.get("country"), + "identifiers": compact_identifiers, + } + + +def _build_standard_source(source): + source = source or {} + identifiers = source.get("identifiers") or {} + compact_identifiers = { + key: value + 
for key, value in identifiers.items() + if value not in (None, "", [], {}, ()) + } + + return { + "source_type": source.get("source_type"), + "source_id": source.get("source_id"), + "scielo_issn": source.get("scielo_issn"), + "main_title": source.get("main_title"), + "acronym": source.get("acronym"), + "publisher_name": source.get("publisher_name"), + "subject_area_capes": source.get("subject_area_capes"), + "subject_area_wos": source.get("subject_area_wos"), + "access_type": source.get("access_type"), + "city": source.get("city"), + "country": source.get("country"), + "identifiers": compact_identifiers, + } diff --git a/metrics/counter/identifiers.py b/metrics/counter/identifiers.py new file mode 100644 index 0000000..bef7b8d --- /dev/null +++ b/metrics/counter/identifiers.py @@ -0,0 +1,110 @@ +def generate_user_session_id(client_name, client_version, ip_address, datetime, sep="|"): + dt_year_month_day = datetime.strftime("%Y-%m-%d") + dt_hour = datetime.strftime("%H") + + return sep.join( + [ + str(client_name), + str(client_version), + str(ip_address), + str(dt_year_month_day), + str(dt_hour), + ] + ) + + +def generate_item_access_id( + col_acron3, + source_key, + pid_v2, + pid_v3, + pid_generic, + user_session_id, + access_country_code, + content_language, + media_format, + content_type, + sep="|", +): + return sep.join( + [ + col_acron3, + str(source_key or ""), + pid_v2 or "", + pid_v3 or "", + pid_generic or "", + str(user_session_id or ""), + str(access_country_code or ""), + str(content_language or ""), + str(media_format or ""), + str(content_type or ""), + ] + ) + + +def generate_month_document_id( + collection: str, + source_key: str, + pid_v2: str, + pid_v3: str, + pid_generic: str, + access_month: str, + counter_access_type: str, + access_method: str, + publication_year: str, + metric_scope: str = None, +) -> str: + parts = [] + if metric_scope: + parts.append(metric_scope) + + parts.extend( + [ + str(collection or ""), + str(source_key or ""), + pid_v2 or "", + pid_v3 or "", + pid_generic or "", + str(access_month or ""), + str(counter_access_type or ""), + str(access_method or ""), + str(publication_year or ""), + ] + ) + return "|".join(parts) + + +def generate_year_document_id( + collection: str, + source_key: str, + pid_v2: str, + pid_v3: str, + pid_generic: str, + content_language: str, + access_country_code: str, + access_year: str, + counter_access_type: str, + access_method: str, + publication_year: str, + metric_scope: str = None, +) -> str: + parts = [] + if metric_scope: + parts.append(metric_scope) + + parts.extend( + [ + str(collection or ""), + str(source_key or ""), + pid_v2 or "", + pid_v3 or "", + pid_generic or "", + content_language or "", + access_country_code or "", + str(access_year or ""), + str(counter_access_type or ""), + str(access_method or ""), + str(publication_year or ""), + ] + ) + return "|".join(parts) diff --git a/metrics/utils/parser_utils.py b/metrics/counter/parser.py similarity index 92% rename from metrics/utils/parser_utils.py rename to metrics/counter/parser.py index ef142e6..2081e5d 100644 --- a/metrics/utils/parser_utils.py +++ b/metrics/counter/parser.py @@ -1,6 +1,7 @@ import logging from scielo_usage_counter.translator.classic import URLTranslatorClassicSite +from scielo_usage_counter.translator.books import URLTranslatorBooksSite from scielo_usage_counter.translator.dataverse import URLTranslatorDataverseSite from scielo_usage_counter.translator.opac import URLTranslatorOPACSite from scielo_usage_counter.translator.opac_alpha 
import URLTranslatorOPACAlphaSite @@ -38,6 +39,7 @@ def translator_class_name_to_obj(name: str): return None translator_classes = { + 'books': URLTranslatorBooksSite, 'classic': URLTranslatorClassicSite, 'dataverse': URLTranslatorDataverseSite, 'opac': URLTranslatorOPACSite, diff --git a/metrics/es.py b/metrics/es.py deleted file mode 100644 index 25ad701..0000000 --- a/metrics/es.py +++ /dev/null @@ -1,385 +0,0 @@ -import logging - -from elasticsearch import Elasticsearch, helpers, NotFoundError -from django.conf import settings - -from .utils import index_utils - - -DEFAULT_ES_INDEX_USAGE_MAPPINGS = { - "properties": { - "collection": { - "type": "keyword" - }, - "journal": { - "properties": { - "scielo_issn": { - "type": "keyword" - }, - "main_title": { - "type": "keyword" - }, - "subject_area_capes": { - "type": "keyword" - }, - "subject_area_wos": { - "type": "keyword" - }, - "acronym": { - "type": "keyword" - }, - "publisher": { - "type": "keyword" - } - } - }, - "pid": { - "type": "keyword" - }, - "pid_v2": { - "type": "keyword" - }, - "pid_v3": { - "type": "keyword" - }, - "pid_generic": { - "type": "keyword" - }, - "year_of_publication": { - "type": "integer" - }, - "media_language": { - "type": "keyword" - }, - "country_code": { - "type": "keyword" - }, - "date": { - "type": "date", - "format": "yyyy-MM-dd" - }, - "total_requests": { - "type": "integer" - }, - "total_investigations": { - "type": "integer" - }, - "unique_requests": { - "type": "integer" - }, - "unique_investigations": { - "type": "integer" - } - } -} - - -class ElasticSearchUsageWrapper: - """ - Wrapper for Elasticsearch usage metrics operations. - This class provides methods to interact with Elasticsearch for indexing, - deleting, and managing usage metrics data. - """ - - def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=False): - self.client = self.get_elasticsearch_client(url, basic_auth, api_key, verify_certs) - - - def get_elasticsearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=False): - """ - Create an Elasticsearch client instance using Django settings. - - :param url: Elasticsearch URL. If None, it will be taken from Django settings. - :param basic_auth: Basic authentication credentials. If None, it will be taken from Django settings. - :param api_key: API key. If None, it will be taken from Django settings. - :param verify_certs: Whether to verify SSL certificates. If None, it will be taken from Django settings. - """ - if not url: - url = getattr(settings, "ES_URL", None) - - if not basic_auth: - basic_auth = getattr(settings, "ES_BASIC_AUTH", None) - - if not api_key: - api_key = getattr(settings, "ES_API_KEY", None) - - if not verify_certs: - verify_certs = getattr(settings, "ES_VERIFY_CERTS", False) - - if basic_auth: - client = Elasticsearch(url, basic_auth=basic_auth, verify_certs=verify_certs) - elif api_key: - client = Elasticsearch(url, api_key=api_key, verify_certs=verify_certs) - else: - client = Elasticsearch(url, verify_certs=verify_certs) - - return client - - - def ping(self): - """ - Check if the Elasticsearch client is available. - Returns True if the client is available, False otherwise. - """ - try: - return self.client.ping() - except Exception as e: - logging.error(f"Error pinging Elasticsearch client: {e}") - return False - - - def create_index(self, index_name, mappings=None, ping_client=False): - """ - Create an Elasticsearch index. - - :param index_name: Name of the index to create. - :param mappings: Mappings for the index. 
If None, default mappings will be used. - :param ping_client: If True, checks if the Elasticsearch client is available before creating the index. - """ - if ping_client and not self.ping(): - return - - if not mappings: - mappings = DEFAULT_ES_INDEX_USAGE_MAPPINGS - - resp = self.client.indices.create( - index=index_name, - mappings=mappings, - ) - logging.info(f"Index {index_name} created: {resp}") - - - def create_index_if_not_exists(self, index_name, mappings=None, ping_client=False): - """ - Create an Elasticsearch index if it does not already exist. - - :param index_name: Name of the index to create. - :param mappings: Mappings for the index. If None, default mappings will be used. - :param ping_client: If True, checks if the Elasticsearch client is available before creating the index. - """ - if ping_client and not self.ping(): - return - - if not self.client.indices.exists(index=index_name): - self.create_index(index_name, mappings, ping_client) - else: - logging.info(f"Index {index_name} already exists. Skipping creation.") - - - def delete_index(self, index_name, ping_client=False): - """ - Delete an Elasticsearch index. - - :param index_name: Name of the index to delete. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the index. - """ - if ping_client and not self.ping(): - return - - self.client.indices.delete(index=index_name) - - - def index_document(self, index_name, doc_id, document, ping_client=False): - """ - Index a document in Elasticsearch. - - :param index_name: Name of the index. - :param doc_id: ID of the document. - :param document: Document to index. - :param ping_client: If True, checks if the Elasticsearch client is available before indexing the document. - """ - if ping_client and not self.ping(): - return - - self.client.index(index=index_name, id=doc_id, document=document) - - - def index_documents(self, index_name, documents, ping_client=False): - """ - Index multiple documents in Elasticsearch. - - :param index_name: Name of the index. - :param documents: Dictionary of documents to index, where keys are document IDs and values are the documents. - :param ping_client: If True, checks if the Elasticsearch client is available before indexing the documents. - """ - if ping_client and not self.ping(): - return - - helpers.bulk( - self.client, - ( - { - "_index": index_name, - "_id": doc_id, - "_source": document, - } - for doc_id, document in documents.items() - ), - ) - - - def delete_document(self, index_name, doc_id, ping_client=False): - """ - Delete a document from Elasticsearch. - - :param index_name: Name of the index. - :param doc_id: ID of the document to delete. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the document. - """ - if ping_client and not self.ping(): - return - - try: - self.client.delete(index=index_name, id=doc_id) - except NotFoundError as e: - logging.error(f"Failed to delete document {doc_id} from Elasticsearch: {e}") - - - def delete_documents(self, index_name, doc_ids, ping_client=False): - """ - Delete multiple documents from Elasticsearch using bulk. - :param index_name: Name of the index. - :param doc_ids: List of document IDs to delete. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents. 
- """ - if ping_client and not self.ping(): - return - - actions = ( - { - "_op_type": "delete", - "_index": index_name, - "_id": doc_id, - } - for doc_id in doc_ids - ) - - try: - helpers.bulk(self.client, actions) - except helpers.BulkIndexError as e: - logging.error(f"BulkIndexError occurred: {e.errors}") - - - def delete_documents_by_key(self, index_name, data, ping_client=False): - """ - Delete multiple documents from Elasticsearch based on specific key-value pairs. - - :param index_name: Name of the index. - :param data: Dictionary where keys are field names and values are single values or lists of values. - :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents. - """ - if ping_client and not self.ping(): - return - - query = { - "query": { - "bool": { - "must": [ - { - "terms": { - key: values if isinstance(values, list) else [values] - } - } - for key, values in data.items() - ] - } - } - } - - try: - self.client.delete_by_query(index=index_name, body=query) - return True - except Exception as e: - logging.error(f"Failed to delete documents: {e}") - - return False - - - def fetch_and_update_documents_locally(self, index_name, documents, batch_size=5000, ping_client=False): - """ - Fetch existing documents from Elasticsearch and update local documents with accumulated metrics. - This function retrieves documents from Elasticsearch in batches and merges their metric fields - with the provided local documents. The merge operation adds values for specific metric fields - or sets them if they don't exist in the local documents. - - Args: - index_name (str): Name of the Elasticsearch index to fetch documents from. - documents (dict): Dictionary of documents to be updated, where keys are document IDs and values - are dictionaries containing metric data. - batch_size (int, optional): Number of documents to fetch in each batch from Elasticsearch. - Defaults to 5000. - ping_client (bool, optional): If True, checks if the Elasticsearch client is available before - fetching documents. Defaults to False. - - Returns: - None: The function modifies the input documents dictionary in-place. - """ - if ping_client and not self.ping(): - return - - existing_docs = {} - ids = list(documents.keys()) - - for i in range(0, len(ids), batch_size): - batch_ids = ids[i:i+batch_size] - resp = self.client.mget(index=index_name, ids=batch_ids) - for doc in resp.get('docs', []): - if doc.get('found'): - existing_docs[doc['_id']] = doc['_source'] - logging.info(f'Found {len(existing_docs)} existing documents in Elasticsearch for update.') - - for doc_id, existing in existing_docs.items(): - current = documents[doc_id] - for field in [ - "total_requests", - "unique_requests", - "total_investigations", - "unique_investigations", - ]: - if field in existing and field in current: - current[field] += existing[field] - elif field in existing: - current[field] = existing[field] - - - def export_to_index(self, index_name, data, batch_size=5000, ping_client=False): - """ - Export data to Elasticsearch index in bulk operations. - This function converts input data to index documents, processes them locally, - and then indexes them to Elasticsearch in batches to optimize performance. - - Args: - index_name (str): Name of the Elasticsearch index to export data to. - data: The data to be exported to the Elasticsearch index - batch_size (int, optional): Number of documents to process in each bulk operation. - Defaults to 5000. 
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available - - Returns: - None: Function performs side effects by indexing data to Elasticsearch - """ - if ping_client and not self.ping(): - return - - bulk_data = [] - documents = index_utils.convert_to_index_documents(data) - self.fetch_and_update_documents_locally(index_name=index_name, documents=documents) - - for key, metric_data in documents.items(): - metric_data['pid'] = metric_data.get('pid_v3') or metric_data.get('pid_v2') or metric_data.get('pid_generic', '') - bulk_data.append({ - "_id": key, - "_source": metric_data, - }) - - if len(bulk_data) >= batch_size: - self.index_documents( - index_name=index_name, - documents={doc["_id"]: doc["_source"] for doc in bulk_data}, - ) - bulk_data = [] - - self.index_documents( - index_name=index_name, - documents={doc["_id"]: doc["_source"] for doc in bulk_data}, - ) diff --git a/metrics/fixtures/top100articles.csv b/metrics/fixtures/top100articles.csv deleted file mode 100755 index 9d979f3..0000000 --- a/metrics/fixtures/top100articles.csv +++ /dev/null @@ -1,97 +0,0 @@ -print_issn online_issn pid_issn collection pid yop year_month_day total_item_requests total_item_investigations unique_item_requests unique_item_investigations -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300005 2005 2024-05-26 13 16 13 16 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100020 2009 2024-05-26 9 10 8 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200012 2009 2024-05-26 8 9 8 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200018 2009 2024-05-26 8 8 8 8 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300004 2005 2024-05-26 8 11 8 11 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200011 2009 2024-05-26 8 9 8 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200001 2009 2024-05-26 7 7 7 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200010 2009 2024-05-26 7 9 7 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300007 2005 2024-05-26 7 10 7 10 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200003 2009 2024-05-26 7 9 7 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400010 2008 2024-05-26 7 7 7 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300008 2005 2024-05-26 7 9 7 9 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000400008 2009 2024-05-26 7 7 7 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400009 2006 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200009 2006 2024-05-26 6 7 6 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000100007 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300003 2007 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100022 2009 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000100006 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200002 2009 2024-05-26 6 7 6 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000100002 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000200014 2007 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100021 2009 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000400010 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000200001 2010 2024-05-26 6 6 6 6 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000200002 2010 2024-05-26 6 7 6 7 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200014 2009 2024-05-26 5 6 5 6 -0002-7014 1851-8044 
0002-7014 arg S0002-70142009000100014 2009 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000200009 2005 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200004 2009 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000100016 2006 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200015 2006 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300005 2007 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000300009 2009 2024-05-26 5 5 5 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000200010 2010 2024-05-26 4 4 4 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100015 2008 2024-05-26 3 4 3 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300002 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200015 2009 2024-05-26 2 3 2 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300001 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300009 2005 2024-05-26 2 4 2 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200005 2009 2024-05-26 2 4 2 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200008 2009 2024-05-26 2 3 2 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300006 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300010 2005 2024-05-26 2 2 2 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000300003 2005 2024-05-26 2 5 2 5 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000300001 2006 2024-05-26 2 2 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100005 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200016 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400004 2005 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100009 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000100014 2005 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200009 2009 2024-05-26 1 2 1 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000100019 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200013 2009 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400007 2008 2024-05-26 1 2 1 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000300010 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200006 2009 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200018 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400002 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142010000300005 2010 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200007 2006 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400004 2006 2024-05-26 1 2 1 2 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000100004 2007 2024-05-26 1 3 1 3 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000200021 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000100002 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100004 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000400004 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400006 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400005 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000300006 2008 2024-05-26 1 1 1 1 
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400011 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300001 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000100020 2007 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400002 2006 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000100005 2005 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200017 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100005 2008 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000200007 2009 2024-05-26 1 4 1 4 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000100023 2009 2024-05-26 1 1 1 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100008 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400008 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400005 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000200006 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400007 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000200013 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000400003 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142009000400006 2009 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300008 2007 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000200008 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000200006 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000400004 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142005000400006 2005 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142007000300006 2007 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000300003 2006 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142008000100007 2008 2024-05-26 0 1 0 1 -0002-7014 1851-8044 0002-7014 arg S0002-70142006000100009 2006 2024-05-26 0 1 0 1 diff --git a/metrics/fixtures/top100articles.tar.gz b/metrics/fixtures/top100articles.tar.gz deleted file mode 100644 index cd49556..0000000 Binary files a/metrics/fixtures/top100articles.tar.gz and /dev/null differ diff --git a/metrics/management/__init__.py b/metrics/management/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/metrics/management/__init__.py @@ -0,0 +1 @@ + diff --git a/metrics/management/commands/__init__.py b/metrics/management/commands/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/metrics/management/commands/__init__.py @@ -0,0 +1 @@ + diff --git a/metrics/management/commands/export_book_r51_monthly_metrics.py b/metrics/management/commands/export_book_r51_monthly_metrics.py new file mode 100644 index 0000000..9889387 --- /dev/null +++ b/metrics/management/commands/export_book_r51_monthly_metrics.py @@ -0,0 +1,431 @@ +import csv +import json +from collections import defaultdict +from pathlib import Path + +from device_detector import DeviceDetector +from django.core.management.base import BaseCommand, CommandError + +from collection.models import Collection +from document.models import Document +from metrics.counter import access, documents as index_docs +from resources.models import MMDB, RobotUserAgent +from scielo_usage_counter import log_handler, url_translator +from 
scielo_usage_counter.translator.books import URLTranslatorBooksSite +from source.models import Source + + +class Command(BaseCommand): + help = ( + "Generate COUNTER R5.1 monthly book metrics from one or more log files, " + "writing item and title CSV outputs." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--input", + dest="inputs", + action="append", + required=True, + help="Input log file path. Repeat --input for multiple files.", + ) + parser.add_argument( + "--item-output", + required=True, + help="Output CSV path for item-level monthly metrics.", + ) + parser.add_argument( + "--title-output", + required=True, + help="Output CSV path for title-level monthly metrics.", + ) + parser.add_argument( + "--summary-output", + help="Optional JSON path with parse and totals summary.", + ) + parser.add_argument( + "--collection", + default="books", + help="Collection acronym (default: books).", + ) + parser.add_argument( + "--robots-source", + choices=sorted(RobotUserAgent.SOURCE_CHOICES), + default=RobotUserAgent.SOURCE_ALL, + help="Which active robot list to use: all, counter, or scielo.", + ) + + def handle(self, *args, **options): + input_paths = [Path(value).expanduser() for value in options["inputs"]] + item_output = Path(options["item_output"]).expanduser() + title_output = Path(options["title_output"]).expanduser() + summary_output = ( + Path(options["summary_output"]).expanduser() + if options.get("summary_output") + else None + ) + + for path in input_paths: + if not path.exists(): + raise CommandError(f"Input file not found: {path}") + + collection = Collection.objects.filter(acron3=options["collection"]).first() + if not collection: + raise CommandError(f"Collection not found: {options['collection']}") + + robots_source = options["robots_source"] + robots_list = RobotUserAgent.get_patterns(source=robots_source) + if not robots_list: + raise CommandError( + f"No robot user agents found in database for source {robots_source}." 
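+                # Without robot user-agent patterns the parser cannot filter bot traffic, so the command aborts here.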
+ ) + + mmdb = MMDB.objects.order_by("-created").first() + if not mmdb: + raise CommandError("No MMDB found in database.") + + parser = log_handler.LogParser( + mmdb_data=mmdb.data, + robots_list=robots_list, + output_mode="dict", + ) + utm = url_translator.URLTranslationManager( + documents_metadata=Document.metadata(collection=collection), + sources_metadata=Source.metadata(collection=collection), + translator=URLTranslatorBooksSite, + ) + + results = {} + parse_summaries = [] + ua_cache = {} + + for path in input_paths: + self.stdout.write(f"Processing {path}...") + parse_summaries.append( + self._parse_file( + path=path, + parser=parser, + utm=utm, + collection=collection, + ua_cache=ua_cache, + results=results, + ) + ) + + monthly_documents = self._build_monthly_documents(results) + + self._write_item_csv(item_output, monthly_documents["item"]) + self._write_title_csv(title_output, monthly_documents["title"]) + + summary = { + "robots_source": robots_source, + "raw_result_count": len(results), + "parse_summaries": parse_summaries, + "totals": { + "total_item_requests": sum( + doc.get("total_requests", 0) for doc in monthly_documents["item"] + ), + "total_item_investigations": sum( + doc.get("total_investigations", 0) + for doc in monthly_documents["item"] + ), + "unique_item_requests": sum( + doc.get("unique_requests", 0) for doc in monthly_documents["item"] + ), + "unique_item_investigations": sum( + doc.get("unique_investigations", 0) + for doc in monthly_documents["item"] + ), + "title_total_item_requests": sum( + doc.get("total_requests", 0) for doc in monthly_documents["title"] + ), + "title_total_item_investigations": sum( + doc.get("total_investigations", 0) + for doc in monthly_documents["title"] + ), + "unique_title_requests": sum( + doc.get("unique_requests", 0) for doc in monthly_documents["title"] + ), + "unique_title_investigations": sum( + doc.get("unique_investigations", 0) + for doc in monthly_documents["title"] + ), + }, + } + + if summary_output: + summary_output.parent.mkdir(parents=True, exist_ok=True) + summary_output.write_text(json.dumps(summary, indent=2, sort_keys=True)) + + self.stdout.write(self.style.SUCCESS(f"Item CSV written to {item_output}")) + self.stdout.write(self.style.SUCCESS(f"Title CSV written to {title_output}")) + if summary_output: + self.stdout.write(self.style.SUCCESS(f"Summary JSON written to {summary_output}")) + + def _parse_file(self, path, parser, utm, collection, ua_cache, results): + stats = defaultdict(int) + imported = 0 + + with path.open("rb") as fh: + for raw_line in fh: + stats["lines_parsed"] += 1 + + try: + line = raw_line.decode().strip() + except UnicodeDecodeError: + line = raw_line.decode("utf-8", errors="ignore").strip() + + match, ip_value = parser.match_with_best_pattern(line) + if not match: + stats["total_ignored_lines"] += 1 + continue + + data = match.groupdict() + is_bunny = "unix_ts" in data + method = "GET" if is_bunny else data.get("method") + status = data.get("status") + user_agent = parser.format_user_agent(data.get("user_agent")) + url = data.get("path") + ip_address = ip_value + + if not parser.has_valid_method(method): + stats["ignored_lines_invalid_method"] += 1 + stats["total_ignored_lines"] += 1 + continue + + if not parser.has_valid_status(status): + if parser.status_is_redirect(status): + stats["ignored_lines_http_redirects"] += 1 + elif parser.status_is_error(status): + stats["ignored_lines_http_errors"] += 1 + stats["total_ignored_lines"] += 1 + continue + + if parser.user_agent_is_bot(user_agent): 
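+                    # Lines from known robot user agents are tallied as ignored and skipped.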
+ stats["ignored_lines_bot"] += 1 + stats["total_ignored_lines"] += 1 + continue + + if not parser.has_supported_url(url): + stats["ignored_lines_static_resources"] += 1 + stats["total_ignored_lines"] += 1 + continue + + if is_bunny: + local_datetime = parser.format_date(data.get("unix_ts"), None) + country_code = data.get("country") or parser.geoip.ip_to_country_code( + ip_address + ) + else: + local_datetime = parser.format_date(data.get("date"), data.get("timezone")) + country_code = parser.geoip.ip_to_country_code(ip_address) + + if not local_datetime: + stats["ignored_lines_invalid_local_datetime"] += 1 + stats["total_ignored_lines"] += 1 + continue + + if not country_code: + stats["ignored_lines_invalid_country_code"] += 1 + stats["total_ignored_lines"] += 1 + continue + + device = ua_cache.get(user_agent) + if device is None: + try: + device = DeviceDetector(user_agent).parse() + except ZeroDivisionError: + stats["ignored_lines_invalid_user_agent"] += 1 + stats["total_ignored_lines"] += 1 + ua_cache[user_agent] = False + continue + ua_cache[user_agent] = device + elif device is False: + stats["ignored_lines_invalid_user_agent"] += 1 + stats["total_ignored_lines"] += 1 + continue + + client_name = parser.format_client_name(device) + client_version = parser.format_client_version(device) + + if not client_name: + stats["ignored_lines_invalid_client_name"] += 1 + stats["total_ignored_lines"] += 1 + continue + + if not client_version: + stats["ignored_lines_invalid_client_version"] += 1 + stats["total_ignored_lines"] += 1 + continue + + translated = utm.translate(url) + item_access_data = access.extract_item_access_data( + collection.acron3, + translated, + ) + is_valid, _ = access.is_valid_item_access_data( + item_access_data, + utm, + ignore_utm_validation=True, + ) + if not is_valid: + stats["total_ignored_lines"] += 1 + continue + + access.update_results_with_item_access_data( + results, + item_access_data, + { + "client_name": client_name, + "client_version": client_version, + "ip_address": ip_address, + "country_code": country_code, + "local_datetime": local_datetime, + "url": url, + }, + ) + imported += 1 + stats["total_imported_lines"] += 1 + + return {"path": str(path), "valid_lines_used": imported, **stats} + + def _build_monthly_documents(self, results): + documents = index_docs.convert_raw_results_to_index_documents(results) + item_documents = {} + title_documents = {} + + for doc in documents["month"].values(): + year_month = doc.get("access_month", "") + scope = doc.get("metric_scope", "item") + if scope == "title": + key = ( + year_month, + doc.get("title_pid_generic") or doc.get("pid_generic"), + doc.get("document_type"), + ) + if key not in title_documents: + title_documents[key] = { + "year_month": year_month, + "title_pid_generic": doc.get("title_pid_generic") + or doc.get("pid_generic"), + "document_type": doc.get("document_type"), + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + title_documents[key]["total_requests"] += doc.get("total_requests", 0) + title_documents[key]["total_investigations"] += doc.get( + "total_investigations", 0 + ) + title_documents[key]["unique_requests"] += doc.get("unique_requests", 0) + title_documents[key]["unique_investigations"] += doc.get( + "unique_investigations", 0 + ) + continue + + key = ( + year_month, + doc.get("title_pid_generic"), + doc.get("pid_generic"), + doc.get("document_type"), + ) + if key not in item_documents: + item_documents[key] = { + "year_month": 
year_month, + "title_pid_generic": doc.get("title_pid_generic"), + "segment_pid_generic": doc.get("pid_generic"), + "document_type": doc.get("document_type"), + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } + item_documents[key]["total_requests"] += doc.get("total_requests", 0) + item_documents[key]["total_investigations"] += doc.get( + "total_investigations", 0 + ) + item_documents[key]["unique_requests"] += doc.get("unique_requests", 0) + item_documents[key]["unique_investigations"] += doc.get( + "unique_investigations", 0 + ) + + return { + "item": list(item_documents.values()), + "title": list(title_documents.values()), + } + + @staticmethod + def _write_item_csv(path, item_documents): + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="") as fh: + writer = csv.DictWriter( + fh, + fieldnames=[ + "year_month", + "title_pid_generic", + "segment_pid_generic", + "document_type", + "total_item_requests", + "total_item_investigations", + "unique_item_requests", + "unique_item_investigations", + ], + ) + writer.writeheader() + for doc in sorted( + item_documents, + key=lambda item: ( + item.get("year_month", ""), + item.get("title_pid_generic") or "", + item.get("segment_pid_generic") or "", + ), + ): + writer.writerow( + { + "year_month": doc.get("year_month", ""), + "title_pid_generic": doc.get("title_pid_generic"), + "segment_pid_generic": doc.get("segment_pid_generic"), + "document_type": doc.get("document_type"), + "total_item_requests": doc.get("total_requests", 0), + "total_item_investigations": doc.get("total_investigations", 0), + "unique_item_requests": doc.get("unique_requests", 0), + "unique_item_investigations": doc.get("unique_investigations", 0), + } + ) + + @staticmethod + def _write_title_csv(path, title_documents): + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="") as fh: + writer = csv.DictWriter( + fh, + fieldnames=[ + "year_month", + "title_pid_generic", + "document_type", + "total_item_requests", + "total_item_investigations", + "unique_title_requests", + "unique_title_investigations", + ], + ) + writer.writeheader() + for doc in sorted( + title_documents, + key=lambda item: ( + item.get("year_month", ""), + item.get("title_pid_generic") or "", + ), + ): + writer.writerow( + { + "year_month": doc.get("year_month", ""), + "title_pid_generic": doc.get("title_pid_generic"), + "document_type": doc.get("document_type"), + "total_item_requests": doc.get("total_requests", 0), + "total_item_investigations": doc.get("total_investigations", 0), + "unique_title_requests": doc.get("unique_requests", 0), + "unique_title_investigations": doc.get("unique_investigations", 0), + } + ) diff --git a/metrics/management/commands/schedule_cleanup_daily_payloads.py b/metrics/management/commands/schedule_cleanup_daily_payloads.py new file mode 100644 index 0000000..285a23f --- /dev/null +++ b/metrics/management/commands/schedule_cleanup_daily_payloads.py @@ -0,0 +1,68 @@ +from django.core.management.base import BaseCommand + +from core.utils.scheduler import schedule_task +from metrics.tasks import task_cleanup_daily_payloads + + +class Command(BaseCommand): + help = ( + "Schedule the periodic cleanup of exported daily metric payload files. " + "Runs weekly on Sunday at 03:00 UTC by default, deleting payload files " + "for jobs that were exported more than 7 days ago." 
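+        # Matches the argument defaults below: day_of_week=0 (Sunday), hour=3, minute=0.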
+ ) + + def add_arguments(self, parser): + parser.add_argument( + "--day-of-week", + default="0", + help="Crontab day of week (0=Sunday, 6=Saturday). Default: 0", + ) + parser.add_argument( + "--hour", + default="3", + help="Crontab hour (0-23). Default: 3", + ) + parser.add_argument( + "--minute", + default="0", + help="Crontab minute (0-59). Default: 0", + ) + parser.add_argument( + "--older-than-days", + type=int, + default=7, + help="Only delete payloads exported more than N days ago. Default: 7", + ) + parser.add_argument( + "--collection", + action="append", + dest="collections", + help="Limit cleanup to a specific collection acronym. Repeat for multiple.", + ) + + def handle(self, *args, **options): + celery_task_name = task_cleanup_daily_payloads.name + + kwargs = { + "older_than_days": options["older_than_days"], + "collections": options.get("collections") or [], + } + + schedule_task( + task=celery_task_name, + name=celery_task_name, + kwargs=kwargs, + description="Weekly cleanup of exported daily payload files from disk.", + day_of_week=options["day_of_week"], + hour=options["hour"], + minute=options["minute"], + ) + + self.stdout.write( + self.style.SUCCESS( + f"Scheduled periodic task '{celery_task_name}' " + f"(day_of_week={options['day_of_week']}, hour={options['hour']}, " + f"minute={options['minute']}, older_than_days={kwargs['older_than_days']}, " + f"collections={kwargs['collections'] or 'all'})." + ) + ) diff --git a/metrics/migrations/0001_initial.py b/metrics/migrations/0001_initial.py index 30ccc96..9746d5f 100644 --- a/metrics/migrations/0001_initial.py +++ b/metrics/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 5.0.7 on 2024-08-30 00:52 +# Generated by Codex on 2026-04-27 import django.db.models.deletion from django.conf import settings @@ -9,13 +9,13 @@ class Migration(migrations.Migration): initial = True dependencies = [ - ("wagtaildocs", "0013_delete_uploadeddocument"), + ("collection", "0001_initial"), migrations.swappable_dependency(settings.AUTH_USER_MODEL), ] operations = [ migrations.CreateModel( - name="Top100ArticlesFile", + name="DailyMetricJob", fields=[ ( "id", @@ -28,133 +28,85 @@ class Migration(migrations.Migration): ), ( "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), ), ( "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), + models.DateTimeField(auto_now=True, verbose_name="Last update date"), + ), + ( + "access_date", + models.DateField(db_index=True, verbose_name="Access Date"), ), ( "status", models.CharField( choices=[ - ("QUE", "Queued"), - ("PAR", "Parsing"), - ("PRO", "Processed"), - ("INV", "Invalidated"), + ("PEN", "Pending"), + ("EXP", "Exporting"), + ("SUC", "Exported"), + ("ERR", "Error"), ], - default="QUE", - max_length=5, + db_index=True, + default="PEN", + max_length=3, + verbose_name="Status", ), ), ( - "attachment", - models.ForeignKey( - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="+", - to="wagtaildocs.document", - verbose_name="Attachment", - ), + "input_log_hashes", + models.JSONField(default=list, verbose_name="Input Log Hashes"), ), ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", + "storage_path", + models.CharField( + blank=True, + default="", + max_length=500, + 
verbose_name="Storage Path", ), ), ( - "updated_by", - models.ForeignKey( + "payload_hash", + models.CharField( blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", + default="", + max_length=64, + verbose_name="Payload Hash", ), ), - ], - options={ - "verbose_name": "Top 100 Articles File", - "verbose_name_plural": "Top 100 Articles Files", - }, - ), - migrations.CreateModel( - name="Top100Articles", - fields=[ ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), + "summary", + models.JSONField(blank=True, default=dict, verbose_name="Summary"), ), ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), + "attempts", + models.PositiveIntegerField(default=0, verbose_name="Attempts"), ), ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), + "error_message", + models.TextField(blank=True, default="", verbose_name="Error Message"), ), - ("pid_issn", models.CharField(max_length=9, verbose_name="PID ISSN")), - ("year_month_day", models.DateField(verbose_name="Date of access")), ( - "print_issn", - models.CharField( - blank=True, max_length=9, null=True, verbose_name="Print ISSN" + "export_started_at", + models.DateTimeField( + blank=True, + null=True, + verbose_name="Export Started At", ), ), ( - "online_issn", - models.CharField( - blank=True, max_length=9, null=True, verbose_name="Online ISSN" - ), + "exported_at", + models.DateTimeField(blank=True, null=True, verbose_name="Exported At"), ), ( "collection", - models.CharField(max_length=3, verbose_name="Collection Acronym 3"), - ), - ("pid", models.CharField(verbose_name="Publication ID")), - ( - "yop", - models.PositiveSmallIntegerField( - verbose_name="Year of Publication" + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", ), ), - ( - "total_item_requests", - models.IntegerField(verbose_name="Total Item Requests"), - ), - ( - "total_item_investigations", - models.IntegerField(verbose_name="Total Item Investigations"), - ), - ( - "unique_item_requests", - models.IntegerField(verbose_name="Unique Item Requests"), - ), - ( - "unique_item_investigations", - models.IntegerField(verbose_name="Unique Item Investigations"), - ), ( "creator", models.ForeignKey( @@ -180,18 +132,23 @@ class Migration(migrations.Migration): ), ], options={ - "verbose_name_plural": "Top 100 Articles", - "indexes": [ - models.Index( - fields=["pid_issn"], name="metrics_top_pid_iss_c1fba9_idx" - ), - models.Index( - fields=["year_month_day"], name="metrics_top_year_mo_8cda7b_idx" - ), - ], - "unique_together": { - ("collection", "pid_issn", "pid", "year_month_day") - }, + "verbose_name": "Daily Metric Job", + "verbose_name_plural": "Daily Metric Jobs", + "unique_together": {("collection", "access_date")}, }, ), + migrations.AddIndex( + model_name="dailymetricjob", + index=models.Index( + fields=["collection", "access_date"], + name="metrics_daily_coll_date_idx", + ), + ), + migrations.AddIndex( + model_name="dailymetricjob", + index=models.Index( + fields=["status", "export_started_at"], + name="metrics_daily_status_exp_idx", + ), + ), ] diff --git a/metrics/migrations/0002_alter_top100articlesfile_status.py b/metrics/migrations/0002_alter_top100articlesfile_status.py deleted file mode 100644 index b2b98c5..0000000 --- 
a/metrics/migrations/0002_alter_top100articlesfile_status.py +++ /dev/null @@ -1,27 +0,0 @@ -# Generated by Django 5.0.7 on 2024-08-30 21:27 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0001_initial"), - ] - - operations = [ - migrations.AlterField( - model_name="top100articlesfile", - name="status", - field=models.CharField( - choices=[ - ("QUE", "Queued"), - ("PAR", "Parsing"), - ("PRO", "Processed"), - ("ERR", "Error"), - ("INV", "Invalidated"), - ], - default="QUE", - max_length=5, - ), - ), - ] diff --git a/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py b/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py deleted file mode 100644 index 8b01d80..0000000 --- a/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py +++ /dev/null @@ -1,187 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-07 16:55 - -import django.db.models.deletion -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("article", "0001_initial"), - ("collection", "0001_initial"), - ("journal", "0001_initial"), - ("metrics", "0002_alter_top100articlesfile_status"), - ] - - operations = [ - migrations.RemoveField( - model_name="top100articlesfile", - name="attachment", - ), - migrations.RemoveField( - model_name="top100articlesfile", - name="creator", - ), - migrations.RemoveField( - model_name="top100articlesfile", - name="updated_by", - ), - migrations.CreateModel( - name="Item", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "article", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="article.article", - verbose_name="Article", - ), - ), - ( - "collection", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="collection.collection", - verbose_name="Collection", - ), - ), - ( - "journal", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="journal.journal", - verbose_name="Journal", - ), - ), - ], - options={ - "verbose_name": "Item", - "verbose_name_plural": "Items", - }, - ), - migrations.CreateModel( - name="UserAgent", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "name", - models.CharField( - db_index=True, max_length=255, verbose_name="Name" - ), - ), - ( - "version", - models.CharField( - db_index=True, max_length=255, verbose_name="Version" - ), - ), - ], - options={ - "verbose_name": "User Agent", - "verbose_name_plural": "User Agents", - "unique_together": {("name", "version")}, - }, - ), - migrations.CreateModel( - name="UserSession", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ("datetime", models.DateTimeField(verbose_name="Datetime")), - ( - "user_ip", - models.CharField( - db_index=True, max_length=255, verbose_name="User IP" - ), - ), - ( - "user_agent", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="metrics.useragent", - verbose_name="User Agent", - ), - ), - ], - options={ - "verbose_name": "User Session", - "verbose_name_plural": "User Sessions", - }, - ), - migrations.CreateModel( - name="ItemAccess", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - 
( - "country_code", - models.CharField( - db_index=True, max_length=2, verbose_name="Country" - ), - ), - ( - "media_language", - models.CharField( - db_index=True, max_length=2, verbose_name="Media Language" - ), - ), - ( - "media_format", - models.CharField(max_length=10, verbose_name="Media Format"), - ), - ( - "item", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="metrics.item", - verbose_name="Item", - ), - ), - ( - "user_session", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="metrics.usersession", - verbose_name="User Session", - ), - ), - ], - options={ - "verbose_name": "Item Access", - "verbose_name_plural": "Items Access", - }, - ), - migrations.DeleteModel( - name="Top100Articles", - ), - ] diff --git a/metrics/migrations/0004_delete_top100articlesfile_and_more.py b/metrics/migrations/0004_delete_top100articlesfile_and_more.py deleted file mode 100644 index b10c41b..0000000 --- a/metrics/migrations/0004_delete_top100articlesfile_and_more.py +++ /dev/null @@ -1,49 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-07 16:55 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0003_remove_top100articlesfile_attachment_and_more"), - ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"), - ] - - operations = [ - migrations.DeleteModel( - name="Top100ArticlesFile", - ), - migrations.AddIndex( - model_name="item", - index=models.Index( - fields=["collection", "journal", "article"], - name="metrics_ite_collect_6971a5_idx", - ), - ), - migrations.AddIndex( - model_name="item", - index=models.Index( - fields=["collection", "journal"], name="metrics_ite_collect_b5f79b_idx" - ), - ), - migrations.AlterUniqueTogether( - name="item", - unique_together={("collection", "journal", "article")}, - ), - migrations.AlterUniqueTogether( - name="usersession", - unique_together={("datetime", "user_agent", "user_ip")}, - ), - migrations.AlterUniqueTogether( - name="itemaccess", - unique_together={ - ( - "item", - "user_session", - "country_code", - "media_format", - "media_language", - ) - }, - ), - ] diff --git a/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py b/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py deleted file mode 100644 index 7bfafff..0000000 --- a/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py +++ /dev/null @@ -1,49 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-27 20:40 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0004_delete_top100articlesfile_and_more"), - ] - - operations = [ - migrations.AlterUniqueTogether( - name="itemaccess", - unique_together=set(), - ), - migrations.AddField( - model_name="itemaccess", - name="click_timestamps", - field=models.JSONField(default=dict, verbose_name="Click Timestamps"), - ), - migrations.AddField( - model_name="itemaccess", - name="content_type", - field=models.CharField( - default="undefined", max_length=16, verbose_name="Content Type" - ), - preserve_default=False, - ), - migrations.AlterField( - model_name="itemaccess", - name="media_format", - field=models.CharField( - db_index=True, max_length=10, verbose_name="Media Format" - ), - ), - migrations.AlterUniqueTogether( - name="itemaccess", - unique_together={ - ( - "item", - "user_session", - "country_code", - "media_format", - "media_language", - "content_type", - ) - }, - ), - ] diff --git 
a/metrics/migrations/0006_alter_itemaccess_content_type.py b/metrics/migrations/0006_alter_itemaccess_content_type.py deleted file mode 100644 index 0e81287..0000000 --- a/metrics/migrations/0006_alter_itemaccess_content_type.py +++ /dev/null @@ -1,17 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-31 21:07 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0005_alter_itemaccess_unique_together_and_more"), - ] - - operations = [ - migrations.AlterField( - model_name="itemaccess", - name="content_type", - field=models.CharField(max_length=32, verbose_name="Content Type"), - ), - ] diff --git a/metrics/migrations/0007_alter_usersession_datetime_and_more.py b/metrics/migrations/0007_alter_usersession_datetime_and_more.py deleted file mode 100644 index e45036e..0000000 --- a/metrics/migrations/0007_alter_usersession_datetime_and_more.py +++ /dev/null @@ -1,23 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-12 17:16 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0006_alter_itemaccess_content_type"), - ] - - operations = [ - migrations.AlterField( - model_name="usersession", - name="datetime", - field=models.DateTimeField(db_index=True, verbose_name="Datetime"), - ), - migrations.AddIndex( - model_name="itemaccess", - index=models.Index( - fields=["item", "user_session"], name="metrics_ite_item_id_8799c9_idx" - ), - ), - ] diff --git a/metrics/migrations/0008_remove_a_few_models.py b/metrics/migrations/0008_remove_a_few_models.py deleted file mode 100644 index dfd14ec..0000000 --- a/metrics/migrations/0008_remove_a_few_models.py +++ /dev/null @@ -1,48 +0,0 @@ -# Generated by Django 5.0.7 on 2025-06-22 17:45 - -from django.db import migrations - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0007_alter_usersession_datetime_and_more"), - ] - - operations = [ - migrations.AlterUniqueTogether( - name="itemaccess", - unique_together=None, - ), - migrations.AlterUniqueTogether( - name="useragent", - unique_together=None, - ), - migrations.AlterUniqueTogether( - name="usersession", - unique_together=None, - ), - migrations.RemoveField( - model_name="itemaccess", - name="user_session", - ), - migrations.RemoveField( - model_name="usersession", - name="user_agent", - ), - migrations.RemoveField( - model_name="itemaccess", - name="item", - ), - migrations.DeleteModel( - name="Item", - ), - migrations.DeleteModel( - name="ItemAccess", - ), - migrations.DeleteModel( - name="UserAgent", - ), - migrations.DeleteModel( - name="UserSession", - ), - ] diff --git a/metrics/models.py b/metrics/models.py index e69de29..aa789b5 100644 --- a/metrics/models.py +++ b/metrics/models.py @@ -0,0 +1,108 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection +from core.models import CommonControlField + + +class DailyMetricJob(CommonControlField): + STATUS_PENDING = "PEN" + STATUS_EXPORTING = "EXP" + STATUS_EXPORTED = "SUC" + STATUS_ERROR = "ERR" + STATUS_CHOICES = ( + (STATUS_PENDING, _("Pending")), + (STATUS_EXPORTING, _("Exporting")), + (STATUS_EXPORTED, _("Exported")), + (STATUS_ERROR, _("Error")), + ) + + collection = models.ForeignKey( + Collection, + verbose_name=_("Collection"), + on_delete=models.CASCADE, + db_index=True, + ) + + access_date = models.DateField( + verbose_name=_("Access Date"), + db_index=True, + ) + + status = models.CharField( + 
verbose_name=_("Status"), + max_length=3, + choices=STATUS_CHOICES, + default=STATUS_PENDING, + db_index=True, + ) + + input_log_hashes = models.JSONField( + verbose_name=_("Input Log Hashes"), + default=list, + ) + + storage_path = models.CharField( + verbose_name=_("Storage Path"), + max_length=500, + blank=True, + default="", + ) + + payload_hash = models.CharField( + verbose_name=_("Payload Hash"), + max_length=64, + blank=True, + default="", + ) + + summary = models.JSONField( + verbose_name=_("Summary"), + default=dict, + blank=True, + ) + + attempts = models.PositiveIntegerField( + verbose_name=_("Attempts"), + default=0, + ) + + error_message = models.TextField( + verbose_name=_("Error Message"), + blank=True, + default="", + ) + + export_started_at = models.DateTimeField( + verbose_name=_("Export Started At"), + null=True, + blank=True, + ) + + exported_at = models.DateTimeField( + verbose_name=_("Exported At"), + null=True, + blank=True, + ) + + @property + def input_log_count(self): + return len(self.input_log_hashes or []) + + @property + def job_id(self): + if not self.payload_hash: + return "" + return f"{self.collection.acron3}|{self.access_date.isoformat()}|{self.payload_hash}" + + class Meta: + verbose_name = _("Daily Metric Job") + verbose_name_plural = _("Daily Metric Jobs") + unique_together = (("collection", "access_date"),) + indexes = [ + models.Index(fields=["collection", "access_date"], name="metrics_daily_coll_date_idx"), + models.Index(fields=["status", "export_started_at"], name="metrics_daily_status_exp_idx"), + ] + + def __str__(self): + return f"{self.collection.acron3}-{self.access_date}" diff --git a/metrics/opensearch/__init__.py b/metrics/opensearch/__init__.py new file mode 100644 index 0000000..fb9df20 --- /dev/null +++ b/metrics/opensearch/__init__.py @@ -0,0 +1,8 @@ +from .client import OpenSearchUsageClient +from .mappings import ( + BOOKS_MONTH_INDEX_MAPPINGS, + BOOKS_YEAR_INDEX_MAPPINGS, + MONTH_INDEX_MAPPINGS, + YEAR_INDEX_MAPPINGS, + get_index_mappings, +) diff --git a/metrics/opensearch/client.py b/metrics/opensearch/client.py new file mode 100644 index 0000000..ce0de5c --- /dev/null +++ b/metrics/opensearch/client.py @@ -0,0 +1,225 @@ +import logging + +from django.conf import settings +from opensearchpy import NotFoundError, OpenSearch, helpers + +from metrics.opensearch.names import generate_month_index_name, generate_year_index_name + +from .mappings import get_index_mappings +from .scripts import ( + IDEMPOTENT_JOB_INCREMENT_SCRIPT, + METRIC_FIELDS, + build_idempotent_job_increment_action, + merge_metric_document, +) + + +class OpenSearchUsageClient: + def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=None): + self.client = self.get_opensearch_client(url, basic_auth, api_key, verify_certs) + + def get_opensearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=None): + url = url or getattr(settings, "OPENSEARCH_URL", None) + basic_auth = basic_auth or getattr(settings, "OPENSEARCH_BASIC_AUTH", None) + api_key = api_key or getattr(settings, "OPENSEARCH_API_KEY", None) + if verify_certs is None: + verify_certs = getattr(settings, "OPENSEARCH_VERIFY_CERTS", False) + + if basic_auth: + return OpenSearch(url, http_auth=tuple(basic_auth), verify_certs=verify_certs) + if api_key: + return OpenSearch(url, api_key=api_key, verify_certs=verify_certs) + return OpenSearch(url, verify_certs=verify_certs) + + def ping(self): + try: + return self.client.ping() + except Exception as exc: + logging.error("Error 
pinging OpenSearch client: %s", exc) + return False + + def create_index(self, index_name, mappings, ping_client=False): + if ping_client and not self.ping(): + return + + response = self.client.indices.create( + index=index_name, + body={ + "settings": {"index": {"number_of_replicas": 0}}, + "mappings": mappings, + }, + ) + logging.info("Index %s created: %s", index_name, response) + + def create_index_if_not_exists(self, index_name, mappings, ping_client=False): + if ping_client and not self.ping(): + return + + if not self.client.indices.exists(index=index_name): + self.create_index(index_name=index_name, mappings=mappings, ping_client=False) + + def ensure_usage_indexes(self, collection, access_date, index_prefix=None): + index_prefix = index_prefix or getattr(settings, "OPENSEARCH_INDEX_NAME", "usage") + year_index = generate_year_index_name(index_prefix, collection, access_date) + month_index = generate_month_index_name(index_prefix, collection, access_date) + + self.create_index_if_not_exists(year_index, get_index_mappings(collection, "year")) + self.create_index_if_not_exists(month_index, get_index_mappings(collection, "month")) + + return {"year": year_index, "month": month_index} + + def delete_index(self, index_name, ping_client=False): + if ping_client and not self.ping(): + return + self.client.indices.delete(index=index_name) + + def index_documents(self, index_name, documents, ping_client=False): + if ping_client and not self.ping(): + return + + if not documents: + return + + helpers.bulk( + self.client, + ( + {"_index": index_name, "_id": doc_id, "_source": document} + for doc_id, document in documents.items() + ), + ) + + def increment_documents_for_daily_job( + self, + index_name, + documents, + job_id, + ping_client=False, + ): + if ping_client and not self.ping(): + return + + if not documents: + return + + helpers.bulk( + self.client, + ( + build_idempotent_job_increment_action( + index_name=index_name, + doc_id=doc_id, + document=document, + job_id=job_id, + ) + for doc_id, document in documents.items() + ), + ) + + def delete_documents(self, index_name, doc_ids, ping_client=False): + if ping_client and not self.ping(): + return + + if not doc_ids: + return + + helpers.bulk( + self.client, + ( + {"_op_type": "delete", "_index": index_name, "_id": doc_id} + for doc_id in doc_ids + ), + ) + + def delete_documents_by_key(self, index_name, data, ping_client=False): + if ping_client and not self.ping(): + return False + + query = { + "query": { + "bool": { + "must": [ + { + "terms": { + key: values if isinstance(values, list) else [values], + } + } + for key, values in data.items() + ] + } + } + } + + try: + self.client.delete_by_query(index=index_name, body=query) + return True + except Exception as exc: + logging.error("Failed to delete documents from %s: %s", index_name, exc) + return False + + def fetch_documents_by_ids(self, index_name, doc_ids, ping_client=False): + if ping_client and not self.ping(): + return {} + + if not doc_ids: + return {} + + try: + response = self.client.mget(index=index_name, body={"ids": doc_ids}) + except NotFoundError: + return {} + + documents = {} + for document in response.get("docs", []): + if document.get("found"): + documents[document["_id"]] = document["_source"] + return documents + + def fetch_documents_by_key(self, index_name, data, ping_client=False): + if ping_client and not self.ping(): + return {} + + query = { + "query": { + "bool": { + "must": [ + { + "terms": { + key: values if isinstance(values, list) else [values], + } 
+ } + for key, values in data.items() + ] + } + } + } + + try: + return { + hit["_id"]: hit["_source"] + for hit in helpers.scan(self.client, index=index_name, query=query) + } + except NotFoundError: + return {} + + def sync_documents(self, index_name, documents, operation="add", ping_client=False): + if ping_client and not self.ping(): + return + + if not documents: + return + + existing_documents = self.fetch_documents_by_ids(index_name=index_name, doc_ids=list(documents.keys())) + upserts = {} + deletes = [] + + for doc_id, document in documents.items(): + merged = merge_metric_document(existing_documents.get(doc_id), document, operation=operation) + if merged is None: + if doc_id in existing_documents: + deletes.append(doc_id) + continue + upserts[doc_id] = merged + + if upserts: + self.index_documents(index_name=index_name, documents=upserts) + if deletes: + self.delete_documents(index_name=index_name, doc_ids=deletes) diff --git a/metrics/opensearch/mappings.py b/metrics/opensearch/mappings.py new file mode 100644 index 0000000..5825c1b --- /dev/null +++ b/metrics/opensearch/mappings.py @@ -0,0 +1,177 @@ +YEAR_INDEX_MAPPINGS = { + "properties": { + "collection": {"type": "keyword"}, + "source": { + "properties": { + "source_type": {"type": "keyword"}, + "source_id": {"type": "keyword"}, + "scielo_issn": {"type": "keyword"}, + "main_title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 512 + } + } + }, + "subject_area_capes": {"type": "keyword"}, + "subject_area_wos": {"type": "keyword"}, + "acronym": {"type": "keyword"}, + "publisher_name": {"type": "keyword"}, + "access_type": {"type": "keyword"}, + "city": {"type": "keyword"}, + "country": {"type": "keyword"}, + "identifiers": {"type": "object"}, + } + }, + "document_type": {"type": "keyword"}, + "scielo_document_type": {"type": "keyword"}, + "metric_scope": {"type": "keyword"}, + "counter_data_type": {"type": "keyword"}, + "parent_data_type": {"type": "keyword"}, + "article_version": {"type": "keyword"}, + "pid": {"type": "keyword"}, + "pid_v2": {"type": "keyword"}, + "pid_v3": {"type": "keyword"}, + "pid_generic": {"type": "keyword"}, + "publication_year": {"type": "integer"}, + "counter_access_type": {"type": "keyword"}, + "access_method": {"type": "keyword"}, + "access_year": {"type": "date", "format": "yyyy"}, + "access_country_code": {"type": "keyword"}, + "content_language": {"type": "keyword"}, + "applied_jobs": {"type": "keyword", "index": False}, + "total_requests": {"type": "integer"}, + "total_investigations": {"type": "integer"}, + "unique_requests": {"type": "integer"}, + "unique_investigations": {"type": "integer"}, + } +} + + +MONTH_INDEX_MAPPINGS = { + "properties": { + "collection": {"type": "keyword"}, + "source": YEAR_INDEX_MAPPINGS["properties"]["source"], + "document_type": {"type": "keyword"}, + "scielo_document_type": {"type": "keyword"}, + "metric_scope": {"type": "keyword"}, + "counter_data_type": {"type": "keyword"}, + "parent_data_type": {"type": "keyword"}, + "article_version": {"type": "keyword"}, + "pid": {"type": "keyword"}, + "pid_v2": {"type": "keyword"}, + "pid_v3": {"type": "keyword"}, + "pid_generic": {"type": "keyword"}, + "publication_year": {"type": "integer"}, + "counter_access_type": {"type": "keyword"}, + "access_method": {"type": "keyword"}, + "access_month": {"type": "date", "format": "yyyy-MM"}, + "applied_jobs": {"type": "keyword", "index": False}, + "daily_metrics": {"type": "object", "dynamic": True}, + "total_requests": {"type": "integer"}, 
+ "total_investigations": {"type": "integer"}, + "unique_requests": {"type": "integer"}, + "unique_investigations": {"type": "integer"}, + } +} + + +BOOKS_YEAR_INDEX_MAPPINGS = { + "properties": { + "collection": {"type": "keyword"}, + "source": { + "properties": { + "source_type": {"type": "keyword"}, + "source_id": {"type": "keyword"}, + "main_title": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 512 + } + } + }, + "access_type": {"type": "keyword"}, + "publisher": {"type": "keyword"}, + "city": {"type": "keyword"}, + "country": {"type": "keyword"}, + "identifiers": { + "properties": { + "book_id": {"type": "keyword"}, + "isbn": {"type": "keyword"}, + "eisbn": {"type": "keyword"}, + "doi": {"type": "keyword"}, + } + }, + } + }, + "document_type": {"type": "keyword"}, + "scielo_document_type": {"type": "keyword"}, + "metric_scope": {"type": "keyword"}, + "counter_data_type": {"type": "keyword"}, + "parent_data_type": {"type": "keyword"}, + "article_version": {"type": "keyword"}, + "pid": {"type": "keyword"}, + "pid_generic": {"type": "keyword"}, + "title_pid_generic": {"type": "keyword"}, + "publication_year": {"type": "integer"}, + "counter_access_type": {"type": "keyword"}, + "access_method": {"type": "keyword"}, + "access_year": {"type": "date", "format": "yyyy"}, + "access_country_code": {"type": "keyword"}, + "content_language": {"type": "keyword"}, + "applied_jobs": {"type": "keyword", "index": False}, + "total_requests": {"type": "integer"}, + "total_investigations": {"type": "integer"}, + "unique_requests": {"type": "integer"}, + "unique_investigations": {"type": "integer"}, + } +} + + +BOOKS_MONTH_INDEX_MAPPINGS = { + "properties": { + "collection": {"type": "keyword"}, + "source": BOOKS_YEAR_INDEX_MAPPINGS["properties"]["source"], + "document_type": {"type": "keyword"}, + "scielo_document_type": {"type": "keyword"}, + "metric_scope": {"type": "keyword"}, + "counter_data_type": {"type": "keyword"}, + "parent_data_type": {"type": "keyword"}, + "article_version": {"type": "keyword"}, + "pid": {"type": "keyword"}, + "pid_generic": {"type": "keyword"}, + "title_pid_generic": {"type": "keyword"}, + "publication_year": {"type": "integer"}, + "counter_access_type": {"type": "keyword"}, + "access_method": {"type": "keyword"}, + "access_month": {"type": "date", "format": "yyyy-MM"}, + "applied_jobs": {"type": "keyword", "index": False}, + "daily_metrics": {"type": "object", "dynamic": True}, + "total_requests": {"type": "integer"}, + "total_investigations": {"type": "integer"}, + "unique_requests": {"type": "integer"}, + "unique_investigations": {"type": "integer"}, + } +} + + +METRIC_FIELDS = ( + "total_requests", + "total_investigations", + "unique_requests", + "unique_investigations", +) + + +def get_index_mappings(collection, granularity): + if granularity not in {"month", "year"}: + raise ValueError("Granularity must be 'month' or 'year'.") + + if collection == "books": + return BOOKS_MONTH_INDEX_MAPPINGS if granularity == "month" else BOOKS_YEAR_INDEX_MAPPINGS + + return MONTH_INDEX_MAPPINGS if granularity == "month" else YEAR_INDEX_MAPPINGS diff --git a/metrics/opensearch/names.py b/metrics/opensearch/names.py new file mode 100644 index 0000000..1ecd493 --- /dev/null +++ b/metrics/opensearch/names.py @@ -0,0 +1,41 @@ +from django.conf import settings + + +def _validate_index_inputs(index_prefix: str, collection: str, date: str): + if not date or not isinstance(date, str): + raise ValueError("Date must be a non-empty string in 'YYYY-MM-DD' 
format.") + if not collection or not isinstance(collection, str): + raise ValueError("Collection must be a non-empty string.") + if not index_prefix or not isinstance(index_prefix, str): + raise ValueError("Index prefix must be a non-empty string.") + + +def _get_collection_size(collection: str) -> str: + return getattr(settings, "COLLECTION_ACRON3_SIZE_MAP", {}).get(collection, "small") + + +def extract_access_year(date: str) -> str: + _validate_index_inputs("usage", "tmp", date) + return date.split("-")[0] + + +def extract_access_month(date: str) -> str: + _validate_index_inputs("usage", "tmp", date) + year, month, _ = date.split("-") + return f"{year}{month}" + + +def generate_month_index_name(index_prefix: str, collection: str, date: str) -> str: + _validate_index_inputs(index_prefix, collection, date) + size = _get_collection_size(collection) + if size in ("xlarge", "large"): + return f"{index_prefix}_monthly_{collection}_{extract_access_year(date)}" + return f"{index_prefix}_monthly_{collection}" + + +def generate_year_index_name(index_prefix: str, collection: str, date: str) -> str: + _validate_index_inputs(index_prefix, collection, date) + size = _get_collection_size(collection) + if size in ("xlarge", "large"): + return f"{index_prefix}_yearly_{collection}_{extract_access_year(date)}" + return f"{index_prefix}_yearly_{collection}" diff --git a/metrics/opensearch/scripts.py b/metrics/opensearch/scripts.py new file mode 100644 index 0000000..a6a5e1c --- /dev/null +++ b/metrics/opensearch/scripts.py @@ -0,0 +1,102 @@ +METRIC_FIELDS = ( + "total_requests", + "total_investigations", + "unique_requests", + "unique_investigations", +) + +IDEMPOTENT_JOB_INCREMENT_SCRIPT = """ +if (ctx._source.applied_jobs == null) { + ctx._source.applied_jobs = []; +} +if (ctx._source.applied_jobs.contains(params.job_id)) { + ctx.op = 'none'; + return; +} +for (entry in params.document.entrySet()) { + if (!params.metric_fields.contains(entry.getKey()) && !'applied_jobs'.equals(entry.getKey()) && !'daily_metrics'.equals(entry.getKey())) { + if (!ctx._source.containsKey(entry.getKey()) || ctx._source[entry.getKey()] != entry.getValue()) { + ctx._source[entry.getKey()] = entry.getValue(); + } + } +} +for (field in params.metric_fields) { + def currentValue = ctx._source.containsKey(field) ? ctx._source[field] : 0; + def increment = params.document.containsKey(field) ? params.document[field] : 0; + ctx._source[field] = currentValue + increment; +} +if (params.document.containsKey('daily_metrics')) { + if (!ctx._source.containsKey('daily_metrics') || ctx._source.daily_metrics == null) { + ctx._source.daily_metrics = new HashMap(); + } + for (dayEntry in params.document.daily_metrics.entrySet()) { + def day = dayEntry.getKey(); + def dayMetrics = dayEntry.getValue(); + if (!ctx._source.daily_metrics.containsKey(day) || ctx._source.daily_metrics[day] == null) { + ctx._source.daily_metrics[day] = new HashMap(); + } + for (metric in params.metric_fields) { + def currentValue = ctx._source.daily_metrics[day].containsKey(metric) ? ctx._source.daily_metrics[day][metric] : 0; + def increment = dayMetrics.containsKey(metric) ? 
dayMetrics[metric] : 0; + ctx._source.daily_metrics[day][metric] = currentValue + increment; + } + } +} +ctx._source.applied_jobs.add(params.job_id); +""" + + +def build_idempotent_job_increment_action(index_name, doc_id, document, job_id): + return { + "_op_type": "update", + "_index": index_name, + "_id": doc_id, + "retry_on_conflict": 5, + "scripted_upsert": True, + "script": { + "lang": "painless", + "source": IDEMPOTENT_JOB_INCREMENT_SCRIPT, + "params": { + "document": document, + "job_id": job_id, + "metric_fields": list(METRIC_FIELDS), + }, + }, + "upsert": { + "applied_jobs": [], + }, + } + + +def merge_metric_document(existing, current, operation="add"): + if existing is None: + if operation == "subtract": + return None + return current + + merged = dict(existing) + merged.update( + { + key: value + for key, value in current.items() + if key not in METRIC_FIELDS and key != "daily_metrics" + } + ) + + signal = -1 if operation == "subtract" else 1 + for field in METRIC_FIELDS: + merged[field] = existing.get(field, 0) + signal * current.get(field, 0) + + if "daily_metrics" in current: + merged_daily = dict(existing.get("daily_metrics") or {}) + for day, metrics in current["daily_metrics"].items(): + day_merged = dict(merged_daily.get(day) or {}) + for field in METRIC_FIELDS: + day_merged[field] = day_merged.get(field, 0) + signal * metrics.get(field, 0) + merged_daily[day] = day_merged + merged["daily_metrics"] = merged_daily + + if all(merged.get(field, 0) <= 0 for field in METRIC_FIELDS): + return None + + return merged diff --git a/metrics/services/__init__.py b/metrics/services/__init__.py new file mode 100644 index 0000000..b305681 --- /dev/null +++ b/metrics/services/__init__.py @@ -0,0 +1,26 @@ +from .jobs import ( + acquire_daily_metric_job, + create_or_update_daily_metric_job, + mark_daily_metric_job_exported, + mark_daily_metric_job_failed, + release_stale_daily_metric_jobs, +) +from .resources import ( + build_search_client, + extract_celery_queue_name, + fetch_required_resources, + get_log_files_for_collection_date, +) +from .parser import ( + is_stale_parsing_log, + process_daily_metric_job, + process_line, + requeue_stale_parsing_log, + setup_parsing_environment, + touch_parse_heartbeat, +) +from .export import ( + export_daily_metric_payload, + export_documents, + load_daily_metric_payload, +) diff --git a/metrics/services/daily_payloads.py b/metrics/services/daily_payloads.py new file mode 100644 index 0000000..0e06af9 --- /dev/null +++ b/metrics/services/daily_payloads.py @@ -0,0 +1,127 @@ +import hashlib +import json +import logging +import os +from datetime import timedelta +from pathlib import Path + +from django.conf import settings +from django.utils import timezone + + +def get_daily_payload_root(): + return Path(settings.MEDIA_ROOT) / "metrics" / "daily_payloads" + + +def build_daily_storage_path(collection, access_date): + return ( + Path(collection.acron3) + / access_date.strftime("%Y") + / access_date.strftime("%m") + / f"{access_date.isoformat()}.json" + ) + + +def resolve_storage_path(storage_path): + return get_daily_payload_root() / storage_path + + +def serialize_payload(payload): + return json.dumps( + payload, + ensure_ascii=True, + sort_keys=True, + separators=(",", ":"), + ) + + +def write_payload(storage_path, payload): + resolved_path = resolve_storage_path(storage_path) + resolved_path.parent.mkdir(parents=True, exist_ok=True) + + payload_json = serialize_payload(payload) + payload_hash = hashlib.sha256(payload_json.encode("utf-8")).hexdigest() 
+ + tmp_path = resolved_path.with_suffix(f"{resolved_path.suffix}.tmp") + tmp_path.write_text(payload_json, encoding="utf-8") + tmp_path.replace(resolved_path) + + return payload_hash + + +def read_payload(storage_path): + resolved_path = resolve_storage_path(storage_path) + return json.loads(resolved_path.read_text(encoding="utf-8")) + + +def delete_payload(storage_path): + resolved_path = resolve_storage_path(storage_path) + if resolved_path.exists(): + resolved_path.unlink() + + +def cleanup_exported_payloads(collections=None, older_than_days=7): + from metrics.models import DailyMetricJob + + root = get_daily_payload_root() + if not root.exists(): + return 0 + + cutoff = timezone.now() - timedelta(days=older_than_days) if older_than_days and older_than_days > 0 else None + + storage_path_to_job = {} + db_queryset = DailyMetricJob.objects.exclude(storage_path="") + if collections: + db_queryset = db_queryset.filter(collection__acron3__in=collections) + for job in db_queryset.iterator(chunk_size=500): + storage_path_to_job[job.storage_path] = job + + json_files = root.rglob("*.json") + if collections: + json_files = [p for p in json_files if p.relative_to(root).parts[0] in collections] + + deleted_count = 0 + for file_path in json_files: + if cutoff and _file_is_recent(file_path, cutoff): + continue + + storage_path = file_path.relative_to(root).as_posix() + job = storage_path_to_job.get(storage_path) + + if job is not None and job.status != DailyMetricJob.STATUS_EXPORTED: + continue + + try: + file_path.unlink() + except FileNotFoundError: + pass + deleted_count += 1 + + if job is not None: + job.storage_path = "" + job.payload_hash = "" + job.save(update_fields=["storage_path", "payload_hash", "updated"]) + + _cleanup_empty_dirs(root) + + logging.info( + "Cleaned up %s daily payload files (collections=%s, older_than_days=%s).", + deleted_count, + collections or "all", + older_than_days, + ) + return deleted_count + + +def _file_is_recent(file_path, cutoff): + return file_path.stat().st_mtime >= cutoff.timestamp() + + +def _cleanup_empty_dirs(root): + for dirpath, dirnames, filenames in os.walk(root, topdown=False): + if dirpath == str(root): + continue + try: + os.rmdir(dirpath) + except OSError: + pass diff --git a/metrics/services/export.py b/metrics/services/export.py new file mode 100644 index 0000000..03efbc6 --- /dev/null +++ b/metrics/services/export.py @@ -0,0 +1,94 @@ +import logging + +from django.conf import settings + +from metrics import opensearch +from metrics.opensearch.names import generate_month_index_name, generate_year_index_name + +from . 
import daily_payloads + + +def load_daily_metric_payload(job): + if not job.storage_path: + return None + try: + return daily_payloads.read_payload(job.storage_path) + except FileNotFoundError: + logging.warning("Daily metric payload not found for job %s.", job.pk) + return None + + +def export_daily_metric_payload(search_client, job, payload): + if not job.job_id: + raise RuntimeError("Daily metric job has no payload hash.") + + export_documents( + search_client=search_client, + documents=payload.get("documents") or {}, + collection=payload.get("collection") or job.collection.acron3, + job_id=job.job_id, + ) + + +def export_documents(search_client, documents, collection, job_id): + if not documents: + return + + _sync_documents_group( + search_client=search_client, + collection=collection, + documents=documents.get("month", {}), + granularity="month", + job_id=job_id, + ) + _sync_documents_group( + search_client=search_client, + collection=collection, + documents=documents.get("year", {}), + granularity="year", + job_id=job_id, + ) + + +def _sync_documents_group( + search_client, + collection, + documents, + granularity, + job_id, +): + if not documents: + return + + grouped_documents = {} + index_prefix = settings.OPENSEARCH_INDEX_NAME + + for doc_id, document in documents.items(): + if granularity == "month": + index_name = generate_month_index_name( + index_prefix=index_prefix, + collection=collection, + date=f"{document.get('access_month')}-01", + ) + mappings = opensearch.get_index_mappings(collection, "month") + else: + index_name = generate_year_index_name( + index_prefix=index_prefix, + collection=collection, + date=f"{document.get('access_year')}-01-01", + ) + mappings = opensearch.get_index_mappings(collection, "year") + + grouped_documents.setdefault(index_name, {"mappings": mappings, "documents": {}}) + grouped_documents[index_name]["documents"][doc_id] = document + + for index_name, payload in grouped_documents.items(): + search_client.create_index_if_not_exists( + index_name=index_name, + mappings=payload["mappings"], + ) + search_client.increment_documents_for_daily_job( + index_name=index_name, + documents=payload["documents"], + job_id=job_id, + ) diff --git a/metrics/services/jobs.py b/metrics/services/jobs.py new file mode 100644 index 0000000..78f5100 --- /dev/null +++ b/metrics/services/jobs.py @@ -0,0 +1,153 @@ +import logging +from datetime import timedelta + +from django.db import transaction +from django.utils import timezone + +from log_manager import choices +from log_manager.models import LogFile + +from metrics.models import DailyMetricJob + + +def create_or_update_daily_metric_job(collection, access_date, log_files): + input_log_hashes = sorted(log_file.hash for log_file in log_files if log_file.hash) + with transaction.atomic(): + job, _ = DailyMetricJob.objects.select_for_update().get_or_create( + collection=collection, + access_date=access_date, + ) + + if job.status == DailyMetricJob.STATUS_EXPORTED: + if job.input_log_hashes != input_log_hashes: + raise RuntimeError( + f"Daily metric job already exported for {collection.acron3} {access_date}. " + "Recompute requires deleting/recreating the affected day or period first." 
+ ) + LogFile.objects.filter(hash__in=input_log_hashes).update( + status=choices.LOG_FILE_STATUS_PROCESSED, + parse_heartbeat_at=None, + updated=timezone.now(), + ) + return job + + keep_payload = ( + job.status == DailyMetricJob.STATUS_ERROR + and job.input_log_hashes == input_log_hashes + and job.storage_path + and job.payload_hash + ) + + job.input_log_hashes = input_log_hashes + job.status = DailyMetricJob.STATUS_PENDING + job.error_message = "" + job.export_started_at = None + job.exported_at = None + if not keep_payload: + job.storage_path = "" + job.payload_hash = "" + job.summary = {} + job.save( + update_fields=[ + "input_log_hashes", + "status", + "error_message", + "export_started_at", + "exported_at", + "storage_path", + "payload_hash", + "summary", + "updated", + ] + ) + return job + + +def acquire_daily_metric_job(job_id): + with transaction.atomic(): + job = ( + DailyMetricJob.objects.select_for_update() + .select_related("collection") + .get(pk=job_id) + ) + if job.status in { + DailyMetricJob.STATUS_EXPORTING, + DailyMetricJob.STATUS_EXPORTED, + }: + logging.info("Daily metric job %s is already in final/active state.", job_id) + return None + + job.status = DailyMetricJob.STATUS_EXPORTING + job.attempts += 1 + job.error_message = "" + job.export_started_at = timezone.now() + job.save( + update_fields=[ + "status", + "attempts", + "error_message", + "export_started_at", + "updated", + ] + ) + return job + + +def mark_daily_metric_job_failed(job, error_message): + DailyMetricJob.objects.filter(pk=job.pk).update( + status=DailyMetricJob.STATUS_ERROR, + error_message=str(error_message), + updated=timezone.now(), + ) + LogFile.objects.filter(hash__in=job.input_log_hashes).update( + status=choices.LOG_FILE_STATUS_ERROR, + parse_heartbeat_at=None, + updated=timezone.now(), + ) + + +def mark_daily_metric_job_exported(job, user=None): + DailyMetricJob.objects.filter(pk=job.pk).update( + status=DailyMetricJob.STATUS_EXPORTED, + error_message="", + exported_at=timezone.now(), + updated=timezone.now(), + ) + LogFile.objects.filter(hash__in=job.input_log_hashes).update( + status=choices.LOG_FILE_STATUS_PROCESSED, + parse_heartbeat_at=None, + updated=timezone.now(), + ) + + +def release_stale_daily_metric_jobs(collections=None, from_date=None, until_date=None, stale_after_minutes=60): + cutoff = timezone.now() - timedelta(minutes=stale_after_minutes) + queryset = DailyMetricJob.objects.filter( + status=DailyMetricJob.STATUS_EXPORTING, + export_started_at__lt=cutoff, + ) + if collections: + queryset = queryset.filter(collection__acron3__in=collections) + if from_date: + queryset = queryset.filter(access_date__gte=from_date) + if until_date: + queryset = queryset.filter(access_date__lte=until_date) + + stale_jobs = list(queryset.only("pk", "input_log_hashes")) + released = queryset.update( + status=DailyMetricJob.STATUS_ERROR, + error_message="Job marked for retry after stale exporting state.", + updated=timezone.now(), + ) + stale_hashes = { + log_hash + for job in stale_jobs + for log_hash in (job.input_log_hashes or []) + } + if stale_hashes: + LogFile.objects.filter(hash__in=stale_hashes).update( + status=choices.LOG_FILE_STATUS_ERROR, + parse_heartbeat_at=None, + updated=timezone.now(), + ) + return released diff --git a/metrics/services/parser.py b/metrics/services/parser.py new file mode 100644 index 0000000..5eb3dbf --- /dev/null +++ b/metrics/services/parser.py @@ -0,0 +1,249 @@ +import logging +from datetime import timedelta +from time import monotonic + +from django.conf import 
settings +from django.utils import timezone + +from scielo_usage_counter import log_handler, url_translator + +from log_manager import choices +from log_manager.models import LogFile +from log_manager_config.models import CollectionLogDirectory +from source.models import Source +from document.models import Document +from tracker.choices import ( + LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT, + LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE, +) +from tracker.models import LogFileDiscardedLine + +from metrics.counter import access, documents as index_docs +from metrics.counter import parser + +from .resources import get_log_files_for_collection_date +from . import daily_payloads + + +def process_daily_metric_job(job, robots_list, mmdb, track_errors=False): + log_files = get_log_files_for_collection_date( + collection=job.collection, + access_date=job.access_date, + ) + if not log_files: + raise RuntimeError(f"No log files found for {job.collection.acron3} {job.access_date}.") + + results = {} + summary = { + "log_files": len(log_files), + "input_log_hashes": sorted(log_file.hash for log_file in log_files if log_file.hash), + "lines_parsed": 0, + "valid_lines": 0, + "discarded_lines": 0, + } + + LogFile.objects.filter(pk__in=[log_file.pk for log_file in log_files]).update( + status=choices.LOG_FILE_STATUS_PARSING, + summary={}, + last_processed_line=0, + parse_heartbeat_at=timezone.now(), + updated=timezone.now(), + ) + LogFileDiscardedLine.objects.filter(log_file_id__in=[log_file.pk for log_file in log_files]).delete() + + heartbeat_interval_seconds = getattr(settings, "METRICS_PARSE_HEARTBEAT_INTERVAL_SECONDS", 30) + + for log_file in log_files: + log_parser, url_translator_manager = setup_parsing_environment( + log_file=log_file, + robots_list=robots_list, + mmdb=mmdb, + ) + line_count = 0 + valid_count = 0 + errors = [] + last_heartbeat_monotonic = monotonic() + + for line in log_parser.parse(): + line_count += 1 + if monotonic() - last_heartbeat_monotonic >= heartbeat_interval_seconds: + touch_parse_heartbeat(log_file, log_parser.stats.lines_parsed) + last_heartbeat_monotonic = monotonic() + + is_valid_line, error_obj = process_line( + results=results, + line=line, + utm=url_translator_manager, + log_file=log_file, + track_errors=track_errors, + ) + if is_valid_line: + valid_count += 1 + else: + summary["discarded_lines"] += 1 + if error_obj: + errors.append(error_obj) + + if errors: + LogFileDiscardedLine.objects.bulk_create(errors) + + summary["lines_parsed"] += line_count + summary["valid_lines"] += valid_count + log_file.summary = { + "parsing_completed": True, + "lines_parsed": line_count, + "valid_lines": valid_count, + } + log_file.last_processed_line = log_parser.stats.lines_parsed + log_file.parse_heartbeat_at = timezone.now() + log_file.save( + update_fields=[ + "summary", + "last_processed_line", + "parse_heartbeat_at", + "updated", + ] + ) + + documents = index_docs.convert_raw_results_to_index_documents(results) + storage_path = daily_payloads.build_daily_storage_path(job.collection, job.access_date) + payload = { + "collection": job.collection.acron3, + "access_date": job.access_date.isoformat(), + "input_log_hashes": summary["input_log_hashes"], + "documents": documents, + "summary": summary, + } + payload_hash = daily_payloads.write_payload(storage_path, payload) + + job.input_log_hashes = summary["input_log_hashes"] + job.storage_path = storage_path.as_posix() + job.payload_hash = payload_hash + job.summary = { + **summary, + "month_document_count": 
len(documents.get("month", {})), + "year_document_count": len(documents.get("year", {})), + } + job.save( + update_fields=[ + "input_log_hashes", + "storage_path", + "payload_hash", + "summary", + "updated", + ] + ) + + return payload + + +def setup_parsing_environment(log_file, robots_list, mmdb): + lp = log_handler.LogParser(mmdb_data=mmdb.data, robots_list=robots_list, output_mode="dict") + lp.logfile = log_file.path + + translator_class = None + for cld in CollectionLogDirectory.objects.filter(config__collection=log_file.collection): + if cld.path in log_file.path: + if cld.translator_class: + translator_class = parser.translator_class_name_to_obj(cld.translator_class) + break + + if not translator_class: + raise Exception(f"No URL translator class found for collection {log_file.collection}.") + + utm = url_translator.URLTranslationManager( + documents_metadata=Document.metadata(collection=log_file.collection), + sources_metadata=Source.metadata(collection=log_file.collection), + translator=translator_class, + ) + return lp, utm + + +def process_line(results, line, utm, log_file, track_errors=False): + try: + translated_url = utm.translate(line.get("url")) + except Exception as exc: + logging.error("Error translating URL %s: %s", line.get("url"), exc) + return False, None + + try: + item_access_data = access.extract_item_access_data(log_file.collection.acron3, translated_url) + except Exception as exc: + logging.error("Error extracting item access data from URL %s: %s", line.get("url"), exc) + return False, None + + ignore_utm_validation = not track_errors + is_valid, check_result = access.is_valid_item_access_data( + item_access_data, + utm, + ignore_utm_validation, + ) + + if not is_valid: + if track_errors: + error_code = check_result.get("code") + if error_code in { + "invalid_scielo_issn", + "invalid_source_id", + "invalid_pid_v3", + "invalid_pid_v2", + "invalid_pid_generic", + }: + tracker_error_type = ( + LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT + if "pid" in error_code + else LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE + ) + + return False, LogFileDiscardedLine.create( + log_file=log_file, + error_type=tracker_error_type, + message=check_result.get("message"), + data={"line": line, "item_access_data": item_access_data}, + save=False, + ) + + return False, None + + try: + access.update_results_with_item_access_data(results, item_access_data, line) + except Exception as exc: + logging.error("Error updating metrics results for URL %s: %s", line.get("url"), exc) + return False, None + + return True, None + + +def touch_parse_heartbeat(log_file, last_processed_line=None): + heartbeat_at = timezone.now() + update_kwargs = { + "parse_heartbeat_at": heartbeat_at, + "updated": heartbeat_at, + } + if last_processed_line is not None: + update_kwargs["last_processed_line"] = last_processed_line or 0 + log_file.last_processed_line = last_processed_line or 0 + LogFile.objects.filter(pk=log_file.pk).update(**update_kwargs) + log_file.parse_heartbeat_at = heartbeat_at + + +def is_stale_parsing_log(log_file, stale_after_minutes=60): + if log_file.status != choices.LOG_FILE_STATUS_PARSING: + return False + + if not log_file.parse_heartbeat_at: + return True + + cutoff = timezone.now() - timedelta(minutes=stale_after_minutes) + return log_file.parse_heartbeat_at < cutoff + + +def requeue_stale_parsing_log(log_file): + now = timezone.now() + LogFile.objects.filter(pk=log_file.pk).update( + status=choices.LOG_FILE_STATUS_ERROR, + parse_heartbeat_at=None, + updated=now, + ) + 
log_file.status = choices.LOG_FILE_STATUS_ERROR + log_file.parse_heartbeat_at = None diff --git a/metrics/services/resources.py b/metrics/services/resources.py new file mode 100644 index 0000000..dc31400 --- /dev/null +++ b/metrics/services/resources.py @@ -0,0 +1,54 @@ +import logging + +from django.conf import settings + +from log_manager.models import LogFile +from resources.models import MMDB, RobotUserAgent + +from metrics import opensearch + + +def extract_celery_queue_name(collection_acronym): + return f"parse_{settings.COLLECTION_ACRON3_SIZE_MAP.get(collection_acronym, 'small')}" + + +def fetch_required_resources(robot_source=None): + robots_list = RobotUserAgent.get_patterns(source=robot_source) + if not robots_list: + logging.error( + "There are no robots available in the database for source %s.", + RobotUserAgent.normalize_source(robot_source), + ) + return None, None + + try: + mmdb = MMDB.objects.latest("created") + except MMDB.DoesNotExist: + logging.error("There are no MMDB files available in the database.") + return None, None + + return robots_list, mmdb + + +def build_search_client(): + return opensearch.OpenSearchUsageClient( + settings.OPENSEARCH_URL, + settings.OPENSEARCH_BASIC_AUTH, + settings.OPENSEARCH_API_KEY, + settings.OPENSEARCH_VERIFY_CERTS, + ) + + +def get_log_files_for_collection_date(collection, access_date, status_filters=None): + queryset = ( + LogFile.objects.filter( + collection=collection, + date=access_date, + ) + .select_related("collection") + .order_by("path", "hash") + ) + if status_filters: + queryset = queryset.filter(status__in=status_filters) + + return list(queryset) diff --git a/metrics/tasks.py b/metrics/tasks.py deleted file mode 100644 index 026bfb5..0000000 --- a/metrics/tasks.py +++ /dev/null @@ -1,508 +0,0 @@ -import logging - -from django.conf import settings -from django.contrib.auth import get_user_model -from django.utils.translation import gettext as _ - -from scielo_usage_counter import log_handler -from scielo_usage_counter import url_translator - -from config import celery_app -from core.utils.utils import _get_user -from core.utils.date_utils import get_date_obj, get_date_range_str -from article.models import Article -from collection.models import Collection -from journal.models import Journal -from log_manager import choices -from log_manager_config.models import CollectionURLTranslatorClass, CollectionLogFilesPerDay, CollectionLogDirectory -from log_manager.models import LogFile, CollectionLogFileDateCount, LogFileDate -from resources.models import MMDB, RobotUserAgent -from tracker.models import LogFileDiscardedLine -from tracker.choices import LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE, LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL - -from . import es -from .utils import parser_utils, index_utils - - -User = get_user_model() - - -def extract_celery_queue_name(collection_acronym): - return f"parse_{settings.COLLECTION_ACRON3_SIZE_MAP.get(collection_acronym, 'small')}" - - -@celery_app.task(bind=True, name=_('Parse logs'), timelimit=-1) -def task_parse_logs(self, collections=[], include_logs_with_error=True, batch_size=5000, replace=False, track_errors=False, from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None): - """ - Parses log files associated with a given collection. - - Args: - collections (list, optional): List of collection acronyms to parse logs for. Defaults to all collections. - include_logs_with_error (bool, optional): Whether to include logs with errors. Defaults to True. 
- batch_size (int, optional): Number of records to process in a single batch. Defaults to 5000. - replace (bool, optional): Whether to replace existing records. Defaults to False. - track_errors (bool, optional): Whether to track errors in log parsing. Defaults to False. - from_date (str, optional): Start date for log parsing in 'YYYY-MM-DD' format. Defaults to None. - until_date (str, optional): End date for log parsing in 'YYYY-MM-DD' format. Defaults to None. - days_to_go_back (int, optional): Number of days to go back from the current date to parse logs. Defaults to None. - user_id - username - - Returns: - None. - """ - from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) - - from_date_obj = get_date_obj(from_date) - until_date_obj = get_date_obj(until_date) - - # Set status filters based on the include_logs_with_error and replace flags - status_filters = [choices.LOG_FILE_STATUS_QUEUED] - if include_logs_with_error: - status_filters.append(choices.LOG_FILE_STATUS_ERROR) - if replace: - status_filters.append(choices.LOG_FILE_STATUS_PROCESSED) - - for collection in collections or Collection.acron3_list(): - for lf in LogFile.objects.filter(status__in=status_filters, collection__acron3=collection): - probably_date = parser_utils.extract_date_from_validation_dict(lf.validation) - if not probably_date: - logging.debug(f'Log file {lf.path} does not have a valid probably date.') - continue - - if probably_date < from_date_obj or probably_date > until_date_obj: - continue - - queue_name = extract_celery_queue_name(collection) - - logging.info(f'PARSING file {lf.path}') - task_parse_log.apply_async( - args=(lf.hash, batch_size, replace, track_errors, user_id, username), - queue=queue_name, - ) - - -@celery_app.task(bind=True, name=_('Parse one log'), timelimit=-1) -def task_parse_log(self, log_file_hash, batch_size=5000, replace=False, track_errors=False, user_id=None, username=None): - """ - Parses a log file, extracts relevant information, and creates processed log records in the database. - - Args: - log_file_hash (str): Hash representing the log file to be parsed. - batch_size (int, optional): Number of records to process in a single batch. Defaults to 5000. - replace (bool, optional): Whether to replace existing records. Defaults to False. - track_errors (bool, optional): Whether to track errors in log parsing. Defaults to False. - user_id - username - - Returns: - None. 
- """ - user = _get_user(self.request, username=username, user_id=user_id) - robots_list, mmdb = _fetch_required_resources() - if not robots_list or not mmdb: - return - - log_file = _initialize_log_file(log_file_hash) - if not log_file: - return - - clfdc = create_or_update_collection_log_file_date_count( - user=user, - collection=log_file.collection, - date=get_date_obj(log_file.validation.get('probably_date')) - ) - - if not replace and clfdc.is_usage_metric_computed: - logging.info(f'Usage metric already computed for {log_file.validation.get("probably_date")}') - return - - if replace: - clfdc.exported_files_count = 0 - clfdc.is_usage_metric_computed = False - clfdc.save() - - log_parser, url_translator_manager = _setup_parsing_environment(log_file, robots_list, mmdb) - success = _process_lines(lp=log_parser, utm=url_translator_manager, log_file=log_file, batch_size=batch_size, replace=replace, track_errors=track_errors) - - if not success: - logging.error(f'Failed to parse log file {log_file.path}.') - log_file.status = choices.LOG_FILE_STATUS_ERROR - log_file.save() - return - - log_file.status = choices.LOG_FILE_STATUS_PROCESSED - log_file.save() - - _update_exported_files_count(clfdc) - - logging.info(f'Log file {log_file.path} has been successfully parsed.') - - -def create_or_update_collection_log_file_date_count(user, collection, date): - n_expected_files = CollectionLogFilesPerDay.get_number_of_expected_files_by_day(collection=collection.acron3, date=date) - n_found_logs = LogFileDate.get_number_of_found_files_for_date(collection=collection.acron3, date=date) - - return CollectionLogFileDateCount.create_or_update( - user=user, - collection=collection, - date=date, - expected_log_files=n_expected_files, - found_log_files=n_found_logs, - ) - - -def _initialize_log_file(log_file_hash): - """ - Initializes the log file for parsing by setting its status to 'parsing'. - - Args: - log_file_hash (str): The hash of the log file to be initialized. - - Returns: - LogFile: The initialized log file object, or None if it does not exist. - """ - try: - log_file = LogFile.get(hash=log_file_hash) - log_file.status = choices.LOG_FILE_STATUS_PARSING - log_file.save() - return log_file - except LogFile.DoesNotExist: - logging.error(f'Log file with hash {log_file_hash} does not exist.') - return None - - -def _fetch_required_resources(): - """ - Fetches the necessary resources for parsing logs, including robot user agents and MMDB files. - - Returns: - tuple: A tuple containing the list of robot user agents and the latest MMDB object. - """ - robots_list = RobotUserAgent.get_all_patterns() - if not robots_list: - logging.error('There are no robots available in the database.') - return None, None - - mmdb = MMDB.objects.latest('created') - if not mmdb: - logging.error('There are no MMDB files available in the database.') - return None, None - - return robots_list, mmdb - - -def _setup_parsing_environment(log_file, robots_list, mmdb): - """ - Sets up the environment for parsing the log file, including initializing the log parser and URL translator manager. - - Args: - log_file (LogFile): The log file to be parsed. - robots_list (list): List of robot user agents. - mmdb (MMDB): The MMDB object containing geolocation data. - - Returns: - tuple: A tuple containing the LogParser instance and URLTranslationManager instance. 
- """ - lp = log_handler.LogParser(mmdb_data=mmdb.data, robots_list=robots_list, output_mode='dict') - lp.logfile = log_file.path - - translator_class = None - for cld in CollectionLogDirectory.objects.filter(collection=log_file.collection): - if cld.path in log_file.path: - try: - translator_class_name = CollectionURLTranslatorClass.objects.get(collection=log_file.collection, directory=cld).translator_class - translator_class = parser_utils.translator_class_name_to_obj(translator_class_name) - break - except CollectionURLTranslatorClass.DoesNotExist: - continue - - if not translator_class: - raise Exception(f'No URL translator class found for collection {log_file.collection}.') - - logging.info(f'Creating URL translation manager for {log_file.collection}') - utm = url_translator.URLTranslationManager( - articles_metadata=Article.metadata(collection=log_file.collection), - journals_metadata=Journal.metadata(collection=log_file.collection), - translator=translator_class, - ) - return lp, utm - - -def _process_lines(lp, utm, log_file, batch_size=5000, replace=False, track_errors=False): - """ - Processes each line of the log file, translating URLs and registering item accesses. - - Args: - lp (LogParser): The log parser instance. - utm (URLTranslationManager): The URL translation manager instance. - log_file (LogFile): The log file being processed. - batch_size (int, optional): Number of records to process in a single batch. Defaults to 5000. - replace (bool, optional): Whether to replace existing records. Defaults to False. - track_errors (bool, optional): Whether to track errors in log parsing. Defaults to False. - - Returns: - None. - """ - logging.info(f'Processing {lp.logfile}') - results = {} - errors = [] - - jump = log_file.last_processed_line if not replace else 0 - - es_manager = es.ElasticSearchUsageWrapper( - settings.ES_URL, - settings.ES_BASIC_AUTH, - settings.ES_API_KEY, - settings.ES_VERIFY_CERTS - ) - - if not es_manager.ping(): - logging.error('Elasticsearch client is not available.') - return False - - index_name = index_utils.generate_index_name( - index_prefix=settings.ES_INDEX_NAME, - collection=log_file.collection.acron3, - date=log_file.validation.get('probably_date') - ) - - es_manager.create_index_if_not_exists(index_name=index_name) - - if replace: - logging.info(f'Removing existing documents for collection {log_file.collection.acron3} and date {log_file.validation.get("probably_date")}') - delete_success = es_manager.delete_documents_by_key( - index_name=index_name, - data={'collection': log_file.collection.acron3, 'date': log_file.validation.get('probably_date')}, - ) - if not delete_success: - logging.error(f'Failed to delete existing documents for collection {log_file.collection.acron3} and date {log_file.validation.get("probably_date")}') - return False - - for line in lp.parse(): - if lp.stats.lines_parsed < jump: - continue - - if lp.stats.lines_parsed % batch_size == 0: - logging.info(f'Processing line {lp.stats.lines_parsed} of {lp.logfile}') - - is_valid_line, error_obj = _process_line(results, line, utm, log_file, track_errors) - if not is_valid_line: - if error_obj: - errors.append(error_obj) - - if len(errors) >= batch_size: - LogFileDiscardedLine.objects.bulk_create(errors) - errors = [] - continue - - if len(results) >= batch_size: - logging.info(f'Indexing data for log file {log_file.path}') - es_manager.export_to_index( - index_name=index_name, - data=results, - batch_size=batch_size - ) - results = {} - - _update_log_file_summary(log_file, 
lp.stats.get_stats()) - - logging.info(f'Indexing data for log file {log_file.path}') - es_manager.export_to_index( - index_name=index_name, - data=results, - batch_size=batch_size - ) - results = {} - - LogFileDiscardedLine.objects.bulk_create(errors) if errors else None - errors = [] - - _update_log_file_summary(log_file, lp.stats.get_stats()) - - return True - - -def _update_log_file_summary(log_file, stats): - if not stats: - logging.warning(f'No stats available for log file {log_file.path}. Skipping summary update.') - return - - summary_k, summary_v = stats - log_file.summary = dict(zip(summary_k, summary_v)) - log_file.last_processed_line = log_file.summary.get('lines_parsed', 0) - log_file.save() - - -def _update_exported_files_count(collection_log_file_date: CollectionLogFileDateCount): - collection_log_file_date.exported_files_count += 1 - collection_log_file_date.set_is_usage_metric_computed() - collection_log_file_date.save() - - -def _process_line(results, line, utm, log_file, track_errors=False): - """ - Process a single log line to extract and validate item access data. - This function translates a URL from the log line, extracts item access data, - validates the data, and updates the results if the data is valid. - - Args: - results: Dictionary or data structure to store processed results - line (dict): Log line containing URL and other access information - utm: URL translation manager for converting URLs - log_file: Log file object containing collection information (must have collection.acron3) - track_errors (bool): Whether to track errors in log parsing. - - Returns: - tuple: A tuple containing a boolean indicating success or failure, and an optional LogFileDiscardedLine object. - - Raises: - Logs errors for URL translation failures and item access data extraction failures. - Logs debug messages for invalid item access data. - """ - try: - translated_url = utm.translate(line.get('url')) - except Exception as e: - logging.error(f'Error translating URL {line.get("url")}: {e}') - return False, None - - try: - item_access_data = index_utils.extract_item_access_data(log_file.collection.acron3, translated_url) - except Exception as e: - logging.error(f'Error extracting item access data from URL {line.get("url")}: {e}') - return False, None - - ignore_utm_validation = not track_errors - is_valid, check_result = index_utils.is_valid_item_access_data(item_access_data, utm, ignore_utm_validation) - - if not is_valid: - if track_errors: - error_code = check_result.get('code') - - if error_code in { - 'invalid_scielo_issn', - 'invalid_pid_v3', - 'invalid_pid_v2', - 'invalid_pid_generic' - }: - if 'pid' in error_code: - tracker_error_type = LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE - else: - tracker_error_type = LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL - - lfdl = LogFileDiscardedLine.create( - log_file=log_file, - error_type=tracker_error_type, - message=check_result.get('message'), - data={'line': line, 'item_access_data': item_access_data}, - save=False, - ) - logging.debug(f'Invalid item access data: {check_result.get("message")}. Line: {line}. 
Item Access Data: {item_access_data}') - return False, lfdl - - return False, None - - index_utils.update_results_with_item_access_data( - results, - item_access_data, - line - ) - - return True, None - - -@celery_app.task(bind=True, name=_('Create index'), timelimit=-1) -def task_create_index(self, index_name, mappings=None, user_id=None, username=None): - """ - Creates an Elasticsearch index with the specified settings and mappings. - - Args: - index_name (str): The name of the index to be created. - mappings (dict, optional): The mappings for the index. Defaults to None. - user_id (int, optional): The ID of the user initiating the task. Defaults to None. - username (str, optional): The username of the user initiating the task. Defaults to None. - - Returns: - None. - """ - user = _get_user(self.request, username=username, user_id=user_id) - es_manager = es.ElasticSearchUsageWrapper( - settings.ES_URL, - settings.ES_BASIC_AUTH, - settings.ES_API_KEY, - settings.ES_VERIFY_CERTS - ) - - try: - if es_manager.client.indices.exists(index=index_name): - logging.info(f"Index {index_name} already exists.") - return - - es_manager.create_index(index_name=index_name, mappings=mappings) - logging.info(f"Index {index_name} created successfully.") - except Exception as e: - logging.error(f"Failed to create index {index_name}: {e}") - - -@celery_app.task(bind=True, name=_('Delete index'), timelimit=-1) -def task_delete_index(self, index_name, user_id=None, username=None): - """ - Deletes an Elasticsearch index. - - Args: - index_name (str): The name of the index to be deleted. - user_id (int, optional): The ID of the user initiating the task. Defaults to None. - username (str, optional): The username of the user initiating the task. Defaults to None. - - Returns: - None. - """ - user = _get_user(self.request, username=username, user_id=user_id) - es_manager = es.ElasticSearchUsageWrapper( - settings.ES_URL, - settings.ES_BASIC_AUTH, - settings.ES_API_KEY, - settings.ES_VERIFY_CERTS - ) - - try: - if not es_manager.client.indices.exists(index=index_name): - logging.info(f"Index {index_name} does not exist.") - return - - es_manager.client.indices.delete(index=index_name) - logging.info(f"Index {index_name} deleted successfully.") - except Exception as e: - logging.error(f"Failed to delete index {index_name}: {e}") - - -@celery_app.task(bind=True, name=_('Delete documents by key'), timelimit=-1) -def task_delete_documents_by_key(self, index_name, data, user_id=None, username=None): - """ - Deletes documents from Elasticsearch based on the provided keys and values. - - Args: - index_name (str): The name of the Elasticsearch index. Defaults to settings.ES_INDEX_NAME. - data (dict): A dictionary where keys are field names and values are the corresponding values to match for deletion. - user_id (int, optional): The ID of the user initiating the task. Defaults to None. - username (str, optional): The username of the user initiating the task. Defaults to None. - - Returns: - None. 
- """ - user = _get_user(self.request, username=username, user_id=user_id) - es_manager = es.ElasticSearchUsageWrapper( - settings.ES_URL, - settings.ES_BASIC_AUTH, - settings.ES_API_KEY, - settings.ES_VERIFY_CERTS - ) - - try: - es_manager.delete_documents_by_key( - index_name=index_name, - data=data, - ) - logging.info(f"Successfully deleted documents with data: {data} from index {index_name}.") - except Exception as e: - logging.error(f"Failed to delete documents with data {data} from index {index_name}: {e}") diff --git a/metrics/tasks/__init__.py b/metrics/tasks/__init__.py new file mode 100644 index 0000000..f0c2d6a --- /dev/null +++ b/metrics/tasks/__init__.py @@ -0,0 +1,19 @@ +from .parse import ( + task_parse_logs, + task_wait_parse_logs_wave, +) +from .process import ( + task_process_daily_metric_job, +) +from .resume import ( + task_resume_log_exports, + task_resume_stale_parsing_logs, +) +from .index import ( + task_create_index, + task_delete_index, + task_delete_documents_by_key, +) +from .cleanup import ( + task_cleanup_daily_payloads, +) diff --git a/metrics/tasks/cleanup.py b/metrics/tasks/cleanup.py new file mode 100644 index 0000000..9b3c8e0 --- /dev/null +++ b/metrics/tasks/cleanup.py @@ -0,0 +1,31 @@ +import logging + +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils.request_utils import _get_user +from metrics.services import daily_payloads + + +@celery_app.task(bind=True, name=_("[Metrics] Cleanup Daily Payloads"), timelimit=-1) +def task_cleanup_daily_payloads( + self, + collections=None, + older_than_days=7, + user_id=None, + username=None, +): + _get_user(self.request, username=username, user_id=user_id) + + deleted_count = daily_payloads.cleanup_exported_payloads( + collections=collections or [], + older_than_days=older_than_days, + ) + + logging.info( + "Cleanup task completed: %s payload file(s) deleted (collections=%s, older_than_days=%s).", + deleted_count, + collections or "all", + older_than_days, + ) + return {"deleted_payloads": deleted_count} diff --git a/metrics/tasks/index.py b/metrics/tasks/index.py new file mode 100644 index 0000000..2635377 --- /dev/null +++ b/metrics/tasks/index.py @@ -0,0 +1,61 @@ +import logging + +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils.request_utils import _get_user + +from metrics.services.resources import build_search_client + + +@celery_app.task(bind=True, name=_("[Metrics] Create Index"), timelimit=-1) +def task_create_index(self, index_name, mappings=None, user_id=None, username=None): + _get_user(self.request, username=username, user_id=user_id) + search_client = build_search_client() + + try: + if search_client.client.indices.exists(index=index_name): + logging.info("Index %s already exists.", index_name) + return + + search_client.create_index(index_name=index_name, mappings=mappings or {}) + logging.info("Index %s created successfully.", index_name) + except Exception as exc: + logging.error("Failed to create index %s: %s", index_name, exc) + + +@celery_app.task(bind=True, name=_("[Metrics] Delete Index"), timelimit=-1) +def task_delete_index(self, index_name, user_id=None, username=None): + _get_user(self.request, username=username, user_id=user_id) + search_client = build_search_client() + + try: + if not search_client.client.indices.exists(index=index_name): + logging.info("Index %s does not exist.", index_name) + return + + search_client.delete_index(index_name=index_name) + logging.info("Index %s deleted 
successfully.", index_name) + except Exception as exc: + logging.error("Failed to delete index %s: %s", index_name, exc) + + +@celery_app.task(bind=True, name=_("[Metrics] Delete Documents by Key"), timelimit=-1) +def task_delete_documents_by_key(self, index_name, data, user_id=None, username=None): + _get_user(self.request, username=username, user_id=user_id) + search_client = build_search_client() + + try: + search_client.delete_documents_by_key(index_name=index_name, data=data) + logging.info( + "Successfully deleted documents with data: %s from index %s.", + data, + index_name, + ) + except Exception as exc: + logging.error( + "Failed to delete documents with data %s from index %s: %s", + data, + index_name, + exc, + ) diff --git a/metrics/tasks/parse.py b/metrics/tasks/parse.py new file mode 100644 index 0000000..7748922 --- /dev/null +++ b/metrics/tasks/parse.py @@ -0,0 +1,286 @@ +import logging + +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils.date_utils import get_date_obj, get_date_range_str +from core.utils.request_utils import _get_user +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from metrics.models import DailyMetricJob + +from metrics.services.resources import extract_celery_queue_name, get_log_files_for_collection_date +from metrics.services.jobs import create_or_update_daily_metric_job +from metrics.tasks.process import task_process_daily_metric_job + +AUTO_REEXECUTE_POLL_INTERVAL_SECONDS = 30 + + +@celery_app.task(bind=True, name=_("[Log Pipeline] 3. Parse Logs (Manual)"), timelimit=-1) +def task_parse_logs( + self, + collections=None, + include_logs_with_error=True, + batch_size=5000, + max_log_files=None, + auto_reexecute=False, + replace=False, + track_errors=False, + from_date=None, + until_date=None, + days_to_go_back=None, + queue_name=None, + user_id=None, + username=None, + skip_log_hashes=None, + robots_source=None, +): + if replace: + raise ValueError( + "replace=True is not supported. Recompute requires deleting/recreating " + "the affected day or period first." 
+ ) + + from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) + from_date_obj = get_date_obj(from_date) + until_date_obj = get_date_obj(until_date) + enqueued_jobs = 0 + reached_max_log_files = False + enqueued_wave_job_ids = [] + claimed_status_filters = list(_build_parse_status_filters(include_logs_with_error)) + skip_log_hashes = set(skip_log_hashes or []) + + for collection in collections or Collection.acron3_list(): + collection_obj = Collection.objects.filter(acron3=collection).first() + if not collection_obj: + continue + + access_dates = _find_access_dates( + collection=collection_obj, + from_date=from_date, + until_date=until_date, + from_date_obj=from_date_obj, + until_date_obj=until_date_obj, + status_filters=claimed_status_filters, + skip_log_hashes=skip_log_hashes, + ) + + for access_date in access_dates: + log_files = get_log_files_for_collection_date( + collection=collection_obj, + access_date=access_date, + status_filters=claimed_status_filters, + ) + log_files = [log_file for log_file in log_files if log_file.hash not in skip_log_hashes] + if not log_files: + continue + + job = create_or_update_daily_metric_job( + collection=collection_obj, + access_date=access_date, + log_files=log_files, + ) + if job.status == DailyMetricJob.STATUS_EXPORTED: + continue + + task_process_daily_metric_job.apply_async( + args=(job.pk, track_errors, user_id, username, robots_source), + queue=queue_name or extract_celery_queue_name(collection), + ) + enqueued_wave_job_ids.append(job.pk) + enqueued_jobs += 1 + if max_log_files and enqueued_jobs >= max_log_files: + reached_max_log_files = True + break + + if reached_max_log_files: + break + + auto_reexecution_enqueued = _schedule_parse_logs_reexecution( + should_reexecute=auto_reexecute and reached_max_log_files and bool(enqueued_wave_job_ids), + wave_job_ids=enqueued_wave_job_ids, + collections=collections, + include_logs_with_error=include_logs_with_error, + batch_size=batch_size, + max_log_files=max_log_files, + auto_reexecute=auto_reexecute, + replace=replace, + track_errors=track_errors, + from_date=from_date, + until_date=until_date, + days_to_go_back=days_to_go_back, + queue_name=queue_name, + user_id=user_id, + username=username, + skip_log_hashes=sorted(skip_log_hashes), + robots_source=robots_source, + ) + + return { + "enqueued_logs": enqueued_jobs, + "enqueued_jobs": enqueued_jobs, + "reached_max_log_files": reached_max_log_files, + "auto_reexecution_enqueued": auto_reexecution_enqueued, + } + + +def _build_parse_status_filters(include_logs_with_error): + status_filters = [choices.LOG_FILE_STATUS_QUEUED] + if include_logs_with_error: + status_filters.append(choices.LOG_FILE_STATUS_ERROR) + return tuple(status_filters) + + +def _find_access_dates( + collection, + from_date, + until_date, + from_date_obj, + until_date_obj, + status_filters, + skip_log_hashes, +): + date_queryset = ( + LogFile.objects.filter( + status__in=status_filters, + collection=collection, + date__gte=from_date_obj, + date__lte=until_date_obj, + ) + .exclude(hash__in=skip_log_hashes) + .values_list("date", flat=True) + .distinct() + .order_by("date") + ) + + access_dates = set() + for value in list(date_queryset): + access_date = value if hasattr(value, "isoformat") else get_date_obj(value) + if access_date and from_date_obj <= access_date <= until_date_obj: + access_dates.add(access_date) + return sorted(access_dates) + + +def _schedule_parse_logs_reexecution( + should_reexecute, + wave_job_ids, + collections, + 
include_logs_with_error, + batch_size, + max_log_files, + auto_reexecute, + replace, + track_errors, + from_date, + until_date, + days_to_go_back, + queue_name, + user_id, + username, + skip_log_hashes, + robots_source=None, +): + if not should_reexecute: + return False + + kwargs = { + "wave_job_ids": wave_job_ids, + "collections": collections, + "include_logs_with_error": include_logs_with_error, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": auto_reexecute, + "replace": replace, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "skip_log_hashes": skip_log_hashes, + "poll_interval_seconds": AUTO_REEXECUTE_POLL_INTERVAL_SECONDS, + } + if robots_source is not None: + kwargs["robots_source"] = robots_source + + task_wait_parse_logs_wave.apply_async(kwargs=kwargs) + return True + + +@celery_app.task(bind=True, name=_("[Metrics] Wait Parse Logs Wave"), timelimit=-1) +def task_wait_parse_logs_wave( + self, + wave_job_ids=None, + collections=None, + include_logs_with_error=True, + batch_size=5000, + max_log_files=None, + auto_reexecute=False, + replace=False, + track_errors=False, + from_date=None, + until_date=None, + days_to_go_back=None, + queue_name=None, + user_id=None, + username=None, + skip_log_hashes=None, + poll_interval_seconds=AUTO_REEXECUTE_POLL_INTERVAL_SECONDS, + robots_source=None, + wave_log_hashes=None, +): + wave_job_ids = wave_job_ids or wave_log_hashes or [] + if DailyMetricJob.objects.filter( + pk__in=wave_job_ids, + status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_EXPORTING], + ).exists(): + kwargs = { + "wave_job_ids": wave_job_ids, + "collections": collections, + "include_logs_with_error": include_logs_with_error, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": auto_reexecute, + "replace": replace, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "skip_log_hashes": skip_log_hashes, + "poll_interval_seconds": poll_interval_seconds, + } + if robots_source is not None: + kwargs["robots_source"] = robots_source + + task_wait_parse_logs_wave.apply_async( + kwargs=kwargs, + countdown=poll_interval_seconds, + ) + return {"wave_completed": False, "reexecution_enqueued": False} + + kwargs = { + "collections": collections, + "include_logs_with_error": include_logs_with_error, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": auto_reexecute, + "replace": replace, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": days_to_go_back, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "skip_log_hashes": skip_log_hashes, + } + if robots_source is not None: + kwargs["robots_source"] = robots_source + + task_parse_logs.apply_async(kwargs=kwargs) + return {"wave_completed": True, "reexecution_enqueued": True} diff --git a/metrics/tasks/process.py b/metrics/tasks/process.py new file mode 100644 index 0000000..ecdc7a5 --- /dev/null +++ b/metrics/tasks/process.py @@ -0,0 +1,63 @@ +import logging + +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils.request_utils import _get_user +from metrics.models import DailyMetricJob + +from metrics.services.jobs 
import acquire_daily_metric_job, mark_daily_metric_job_exported, mark_daily_metric_job_failed +from metrics.services.export import export_daily_metric_payload, load_daily_metric_payload +from metrics.services.resources import build_search_client, fetch_required_resources +from metrics.services.parser import process_daily_metric_job + + +@celery_app.task(bind=True, name=_("[Metrics] Process Daily Job"), timelimit=-1) +def task_process_daily_metric_job( + self, + job_id, + track_errors=False, + user_id=None, + username=None, + robots_source=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + + try: + job = acquire_daily_metric_job(job_id) + except DailyMetricJob.DoesNotExist: + logging.error("Daily metric job %s does not exist.", job_id) + return + + if not job: + return + + try: + payload = load_daily_metric_payload(job) + if payload is None or not job.payload_hash: + robots_list, mmdb = fetch_required_resources(robot_source=robots_source) + if not robots_list or not mmdb: + raise RuntimeError("Required parsing resources are not available.") + payload = process_daily_metric_job( + job=job, + robots_list=robots_list, + mmdb=mmdb, + track_errors=track_errors, + ) + job.refresh_from_db() + + search_client = build_search_client() + if not search_client.ping(): + raise RuntimeError("OpenSearch client is not available.") + + export_daily_metric_payload( + search_client=search_client, + job=job, + payload=payload, + ) + except Exception as exc: + logging.error("Failed to process daily metric job %s: %s", job_id, exc) + mark_daily_metric_job_failed(job, exc) + return + + mark_daily_metric_job_exported(job, user=user) diff --git a/metrics/tasks/resume.py b/metrics/tasks/resume.py new file mode 100644 index 0000000..c0fe705 --- /dev/null +++ b/metrics/tasks/resume.py @@ -0,0 +1,166 @@ +import logging + +from django.utils import timezone +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils.date_utils import get_date_obj, get_date_range_str +from core.utils.request_utils import _get_user +from log_manager import choices +from log_manager.models import LogFile +from metrics.models import DailyMetricJob + +from metrics.services.jobs import create_or_update_daily_metric_job, release_stale_daily_metric_jobs +from metrics.services.resources import extract_celery_queue_name, get_log_files_for_collection_date +from metrics.services.parser import is_stale_parsing_log, requeue_stale_parsing_log +from metrics.counter import parser + +from .parse import task_parse_logs +from .process import task_process_daily_metric_job + + +@celery_app.task(bind=True, name=_("[Metrics] Resume Log Exports"), timelimit=-1) +def task_resume_log_exports( + self, + collections=None, + from_date=None, + until_date=None, + days_to_go_back=None, + stale_after_minutes=60, + queue_name=None, + user_id=None, + username=None, + robots_source=None, +): + _get_user(self.request, username=username, user_id=user_id) + + from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) + from_date_obj = get_date_obj(from_date) + until_date_obj = get_date_obj(until_date) + + released_stale_jobs = release_stale_daily_metric_jobs( + collections=collections, + from_date=from_date_obj, + until_date=until_date_obj, + stale_after_minutes=stale_after_minutes, + ) + queryset = DailyMetricJob.objects.filter( + status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_ERROR], + access_date__gte=from_date_obj, + access_date__lte=until_date_obj, + 
).select_related("collection").order_by("access_date", "collection__acron3") + if collections: + queryset = queryset.filter(collection__acron3__in=collections) + + resumed_jobs = 0 + for job in queryset: + log_files = get_log_files_for_collection_date( + collection=job.collection, + access_date=job.access_date, + status_filters=[ + choices.LOG_FILE_STATUS_QUEUED, + choices.LOG_FILE_STATUS_ERROR, + ], + ) + if log_files: + job = create_or_update_daily_metric_job( + collection=job.collection, + access_date=job.access_date, + log_files=log_files, + ) + elif not (job.storage_path and job.payload_hash): + logging.warning( + "Skipping daily metric job %s: no queued/error logs or stored payload.", + job.pk, + ) + continue + + if job.status == DailyMetricJob.STATUS_EXPORTED: + continue + + task_process_daily_metric_job.apply_async( + args=(job.pk, False, user_id, username, robots_source), + queue=queue_name or extract_celery_queue_name(job.collection.acron3), + ) + resumed_jobs += 1 + + logging.info( + "Resumed daily metric jobs for %s day(s); released %s stale job(s) at %s.", + resumed_jobs, + released_stale_jobs, + timezone.now(), + ) + return { + "resumed_logs": resumed_jobs, + "resumed_jobs": resumed_jobs, + "released_stale_batches": released_stale_jobs, + "released_stale_jobs": released_stale_jobs, + } + + +@celery_app.task(bind=True, name=_("[Metrics] Resume Stale Parsing Logs"), timelimit=-1) +def task_resume_stale_parsing_logs( + self, + collections=None, + batch_size=5000, + track_errors=False, + from_date=None, + until_date=None, + days_to_go_back=None, + stale_after_minutes=60, + max_log_files=None, + queue_name=None, + user_id=None, + username=None, + robots_source=None, +): + from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back) + from_date_obj = get_date_obj(from_date) + until_date_obj = get_date_obj(until_date) + + queryset = ( + LogFile.objects.filter(status=choices.LOG_FILE_STATUS_PARSING) + .select_related("collection") + .order_by("validation__probably_date", "path", "hash") + ) + if collections: + queryset = queryset.filter(collection__acron3__in=collections) + + resumed_logs = 0 + for log_file in queryset: + probably_date = parser.extract_date_from_validation_dict(log_file.validation) + if not probably_date or probably_date < from_date_obj or probably_date > until_date_obj: + continue + if not is_stale_parsing_log(log_file, stale_after_minutes=stale_after_minutes): + continue + + requeue_stale_parsing_log(log_file) + resumed_logs += 1 + if max_log_files and resumed_logs >= max_log_files: + break + + apply_kwargs = { + "kwargs": { + "collections": collections, + "include_logs_with_error": True, + "batch_size": batch_size, + "max_log_files": max_log_files, + "auto_reexecute": False, + "replace": False, + "track_errors": track_errors, + "from_date": from_date, + "until_date": until_date, + "days_to_go_back": None, + "queue_name": queue_name, + "user_id": user_id, + "username": username, + "robots_source": robots_source, + } + } + if queue_name: + apply_kwargs["queue"] = queue_name + task_parse_logs.apply_async(**apply_kwargs) + return { + "stale_logs_marked_for_retry": resumed_logs, + "parse_logs_enqueued": True, + } diff --git a/metrics/templates/search/indexes/metrics/top100articles_text.txt b/metrics/templates/search/indexes/metrics/top100articles_text.txt deleted file mode 100644 index ccf5e94..0000000 --- a/metrics/templates/search/indexes/metrics/top100articles_text.txt +++ /dev/null @@ -1,10 +0,0 @@ -{{ object.collection }} -{{ object.key_issn 
}} -{{ object.pid }} -{{ object.yop }} -{{ object.language }} -{{ object.country }} -{{ object.total_item_requests }} -{{ object.total_item_investigations }} -{{ object.unique_item_requests }} -{{ object.unique_item_investigations }} \ No newline at end of file diff --git a/metrics/tests/test_cleanup.py b/metrics/tests/test_cleanup.py new file mode 100644 index 0000000..e08fa9c --- /dev/null +++ b/metrics/tests/test_cleanup.py @@ -0,0 +1,283 @@ +import json +import os +import shutil +import tempfile +import time +from datetime import date +from pathlib import Path +from unittest.mock import patch + +from django.test import TestCase + +from collection.models import Collection +from metrics.models import DailyMetricJob +from metrics.services import daily_payloads + + +class CleanupExportedPayloadsTests(TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls._tmpdir = tempfile.TemporaryDirectory() + cls._patched_root = patch.object( + daily_payloads, + "get_daily_payload_root", + return_value=Path(cls._tmpdir.name), + ) + cls._patched_root.start() + + @classmethod + def tearDownClass(cls): + cls._patched_root.stop() + cls._tmpdir.cleanup() + super().tearDownClass() + + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + self.other_collection = Collection.objects.create(acron3="scl", acron2="sc") + + self.payload_root = daily_payloads.get_daily_payload_root() + self._clean_temp_dir() + + def _clean_temp_dir(self): + root = self.payload_root + if root.exists(): + for item in root.iterdir(): + if item.is_dir(): + shutil.rmtree(item) + else: + item.unlink() + + def _create_job(self, collection, access_date, status, storage_path, payload_hash): + return DailyMetricJob.objects.create( + collection=collection, + access_date=access_date, + status=status, + storage_path=storage_path, + payload_hash=payload_hash, + ) + + def _write_payload_file(self, storage_path): + resolved = daily_payloads.resolve_storage_path(storage_path) + resolved.parent.mkdir(parents=True, exist_ok=True) + resolved.write_text(json.dumps({"test": True}), encoding="utf-8") + return resolved + + def _set_file_age(self, file_path, days_old): + old_time = time.time() - days_old * 86400 + os.utime(file_path, (old_time, old_time)) + + def test_cleanup_deletes_old_exported_payloads(self): + path = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + resolved = self._write_payload_file(path) + self._set_file_age(resolved, 30) + + self._create_job( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTED, + storage_path=path.as_posix(), + payload_hash="abc", + ) + + result = daily_payloads.cleanup_exported_payloads(older_than_days=7) + + self.assertEqual(result, 1) + self.assertFalse(resolved.exists()) + + def test_cleanup_skips_recent_files(self): + path = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + resolved = self._write_payload_file(path) + + self._create_job( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTED, + storage_path=path.as_posix(), + payload_hash="abc", + ) + + result = daily_payloads.cleanup_exported_payloads(older_than_days=7) + + self.assertEqual(result, 0) + self.assertTrue(resolved.exists()) + + def test_cleanup_skips_non_exported_jobs(self): + statuses = [ + DailyMetricJob.STATUS_PENDING, + DailyMetricJob.STATUS_ERROR, + DailyMetricJob.STATUS_EXPORTING, + ] + paths = [] + for i, status in 
enumerate(statuses): + access_date = date(2012, 3, 10 + i) + path = daily_payloads.build_daily_storage_path( + self.collection, access_date + ) + resolved = self._write_payload_file(path) + self._set_file_age(resolved, 30) + paths.append(resolved) + + self._create_job( + collection=self.collection, + access_date=access_date, + status=status, + storage_path=path.as_posix(), + payload_hash="abc", + ) + + result = daily_payloads.cleanup_exported_payloads(older_than_days=7) + + self.assertEqual(result, 0) + for p in paths: + self.assertTrue(p.exists()) + + def test_cleanup_filters_by_collection(self): + path_books = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + path_scl = daily_payloads.build_daily_storage_path( + self.other_collection, date(2012, 3, 10) + ) + resolved_books = self._write_payload_file(path_books) + resolved_scl = self._write_payload_file(path_scl) + self._set_file_age(resolved_books, 30) + self._set_file_age(resolved_scl, 30) + + self._create_job( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTED, + storage_path=path_books.as_posix(), + payload_hash="abc", + ) + self._create_job( + collection=self.other_collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTED, + storage_path=path_scl.as_posix(), + payload_hash="def", + ) + + result = daily_payloads.cleanup_exported_payloads( + collections=["books"], + older_than_days=7, + ) + + self.assertEqual(result, 1) + self.assertFalse(resolved_books.exists()) + self.assertTrue(resolved_scl.exists()) + + def test_cleanup_deletes_orphan_files(self): + path = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + resolved = self._write_payload_file(path) + self._set_file_age(resolved, 30) + + result = daily_payloads.cleanup_exported_payloads(older_than_days=7) + + self.assertEqual(result, 1) + self.assertFalse(resolved.exists()) + + def test_cleanup_skips_orphan_file_with_old_db_job_not_exported(self): + path = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + resolved = self._write_payload_file(path) + self._set_file_age(resolved, 30) + + self._create_job( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_PENDING, + storage_path=path.as_posix(), + payload_hash="abc", + ) + + result = daily_payloads.cleanup_exported_payloads(older_than_days=7) + + self.assertEqual(result, 0) + self.assertTrue(resolved.exists()) + + def test_cleanup_clears_db_fields_for_exported_jobs(self): + path = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + resolved = self._write_payload_file(path) + self._set_file_age(resolved, 30) + + job = self._create_job( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTED, + storage_path=path.as_posix(), + payload_hash="abc", + ) + + daily_payloads.cleanup_exported_payloads(older_than_days=7) + + job.refresh_from_db() + self.assertEqual(job.storage_path, "") + self.assertEqual(job.payload_hash, "") + + def test_cleanup_with_no_matching_files(self): + result = daily_payloads.cleanup_exported_payloads(older_than_days=7) + self.assertEqual(result, 0) + + def test_cleanup_without_older_than_days_deletes_all(self): + path = daily_payloads.build_daily_storage_path( + self.collection, date(2012, 3, 10) + ) + resolved = self._write_payload_file(path) + + self._create_job( + collection=self.collection, + access_date=date(2012, 3, 
10), + status=DailyMetricJob.STATUS_EXPORTED, + storage_path=path.as_posix(), + payload_hash="abc", + ) + + result = daily_payloads.cleanup_exported_payloads(older_than_days=0) + + self.assertEqual(result, 1) + self.assertFalse(resolved.exists()) + + +class CleanupTaskTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def test_task_cleanup_daily_payloads_calls_service(self): + with patch("metrics.services.daily_payloads.cleanup_exported_payloads") as mock_cleanup: + mock_cleanup.return_value = 5 + from metrics.tasks import task_cleanup_daily_payloads + + result = task_cleanup_daily_payloads.run( + collections=["books"], + older_than_days=7, + ) + + mock_cleanup.assert_called_once_with( + collections=["books"], + older_than_days=7, + ) + self.assertEqual(result, {"deleted_payloads": 5}) + + def test_task_cleanup_with_defaults(self): + with patch("metrics.services.daily_payloads.cleanup_exported_payloads") as mock_cleanup: + mock_cleanup.return_value = 0 + from metrics.tasks import task_cleanup_daily_payloads + + result = task_cleanup_daily_payloads.run() + + mock_cleanup.assert_called_once_with( + collections=[], + older_than_days=7, + ) + self.assertEqual(result, {"deleted_payloads": 0}) diff --git a/metrics/tests/test_daily_jobs.py b/metrics/tests/test_daily_jobs.py new file mode 100644 index 0000000..f31b410 --- /dev/null +++ b/metrics/tests/test_daily_jobs.py @@ -0,0 +1,162 @@ +from datetime import date, timedelta + +from django.contrib.auth import get_user_model +from django.test import TestCase +from django.utils import timezone +from scielo_usage_counter.values import CONTENT_TYPE_FULL_TEXT, MEDIA_FORMAT_HTML + +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from metrics.models import DailyMetricJob +from metrics import services + + +class DailyMetricJobServiceTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def _log_file(self, hash_value, status=choices.LOG_FILE_STATUS_QUEUED): + return LogFile.objects.create( + hash=hash_value, + path=f"/tmp/{hash_value}.log.gz", + stat_result={}, + status=status, + collection=self.collection, + validation={"probably_date": "2012-03-10"}, + ) + + def test_create_or_update_blocks_implicit_recompute_after_export(self): + first = self._log_file("1" * 32) + second = self._log_file("2" * 32) + DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTED, + input_log_hashes=[first.hash], + storage_path="books/2012/03/2012-03-10.json", + payload_hash="abc", + ) + + with self.assertRaises(RuntimeError): + services.create_or_update_daily_metric_job( + collection=self.collection, + access_date=date(2012, 3, 10), + log_files=[first, second], + ) + + def test_create_or_update_keeps_payload_for_export_retry(self): + log_file = self._log_file("1" * 32, status=choices.LOG_FILE_STATUS_ERROR) + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + input_log_hashes=[log_file.hash], + storage_path="books/2012/03/2012-03-10.json", + payload_hash="abc", + summary={"month_document_count": 1}, + ) + + services.create_or_update_daily_metric_job( + collection=self.collection, + access_date=date(2012, 3, 10), + log_files=[log_file], + ) + + job.refresh_from_db() + self.assertEqual(job.status, DailyMetricJob.STATUS_PENDING) 
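+ # A retried export keeps the previously computed payload reference, hash, and summary, so the day does not need re-parsing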
+ self.assertEqual(job.storage_path, "books/2012/03/2012-03-10.json") + self.assertEqual(job.payload_hash, "abc") + self.assertEqual(job.summary, {"month_document_count": 1}) + + def test_create_or_update_clears_stale_payload_when_inputs_change_before_success(self): + first = self._log_file("1" * 32) + second = self._log_file("2" * 32) + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + input_log_hashes=[first.hash], + storage_path="books/2012/03/2012-03-10.json", + payload_hash="abc", + summary={"month_document_count": 1}, + ) + + services.create_or_update_daily_metric_job( + collection=self.collection, + access_date=date(2012, 3, 10), + log_files=[first, second], + ) + + job.refresh_from_db() + self.assertEqual(job.input_log_hashes, sorted([first.hash, second.hash])) + self.assertEqual(job.storage_path, "") + self.assertEqual(job.payload_hash, "") + self.assertEqual(job.summary, {}) + + def test_release_stale_daily_metric_jobs_marks_logs_for_retry(self): + log_file = self._log_file("1" * 32, status=choices.LOG_FILE_STATUS_PARSING) + DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, + input_log_hashes=[log_file.hash], + export_started_at=timezone.now() - timedelta(minutes=120), + ) + + released = services.release_stale_daily_metric_jobs(stale_after_minutes=60) + + log_file.refresh_from_db() + self.assertEqual(released, 1) + self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_ERROR) + self.assertIsNone(log_file.parse_heartbeat_at) + + def test_process_line_discards_invalid_local_datetime_without_raising(self): + class FakeUtm: + def translate(self, url): + return { + "book_id": "q7gtd", + "pid_generic": "book:q7gtd", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + + log_file = self._log_file("1" * 32) + results = {} + + is_valid, error = services.process_line( + results=results, + line={ + "url": "/id/q7gtd", + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "local_datetime": None, + }, + utm=FakeUtm(), + log_file=log_file, + ) + + self.assertFalse(is_valid) + self.assertIsNone(error) + self.assertEqual(results, {}) + + def test_mark_daily_metric_job_exported_records_updated_by(self): + user = get_user_model().objects.create_user( + username="tester", + email="tester@example.org", + password="secret", + ) + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, + ) + + services.mark_daily_metric_job_exported(job, user=user) + + job.refresh_from_db() + self.assertEqual(job.status, DailyMetricJob.STATUS_EXPORTED) + self.assertIsNotNone(job.exported_at) diff --git a/metrics/tests/test_index_utils.py b/metrics/tests/test_index_utils.py index 47f1a0e..562fc42 100644 --- a/metrics/tests/test_index_utils.py +++ b/metrics/tests/test_index_utils.py @@ -1,104 +1,894 @@ +import csv import unittest +from datetime import datetime +from pathlib import Path +from tempfile import TemporaryDirectory from scielo_usage_counter.values import ( - MEDIA_FORMAT_UNDEFINED, - MEDIA_FORMAT_PDF, - MEDIA_FORMAT_HTML, - CONTENT_TYPE_UNDEFINED, - CONTENT_TYPE_FULL_TEXT, CONTENT_TYPE_ABSTRACT, + CONTENT_TYPE_FULL_TEXT, + CONTENT_TYPE_UNDEFINED, DEFAULT_SCIELO_ISSN, + MEDIA_FORMAT_HTML, + MEDIA_FORMAT_PDF, + MEDIA_FORMAT_UNDEFINED, ) 
-from metrics.utils import index_utils +from metrics.counter import access, documents as index_docs +from metrics.opensearch.names import generate_month_index_name, generate_year_index_name -class TestIndexUtils(unittest.TestCase): +class TestIndexUtils(unittest.TestCase): def test_is_valid_item_access_data_valid(self): data = { - 'scielo_issn': '1234-5678', - 'pid_v2': 'S0102-67202020000100001', - 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv', - 'media_format': MEDIA_FORMAT_PDF, - 'content_type': CONTENT_TYPE_FULL_TEXT, + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertTrue(result) def test_is_valid_item_access_data_missing_scielo_issn(self): data = { - 'scielo_issn': '', - 'pid_v2': 'S0102-67202020000100001', - 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv', - 'media_format': MEDIA_FORMAT_PDF, - 'content_type': CONTENT_TYPE_FULL_TEXT, + "scielo_issn": "", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertFalse(result) + def test_is_valid_item_access_data_valid_book_source(self): + data = { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_generic": "BOOK:Q7GTD", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = access.is_valid_item_access_data(data) + self.assertTrue(result) + def test_is_valid_item_access_data_undefined_media_format(self): data = { - 'scielo_issn': '1234-5678', - 'pid_v2': 'S0102-67202020000100001', - 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv', - 'media_format': MEDIA_FORMAT_UNDEFINED, - 'content_type': CONTENT_TYPE_FULL_TEXT, + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_UNDEFINED, + "content_type": CONTENT_TYPE_FULL_TEXT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertFalse(result) def test_is_valid_item_access_data_undefined_content_type(self): data = { - 'scielo_issn': '1234-5678', - 'pid_v2': 'S0102-67202020000100001', - 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv', - 'media_format': MEDIA_FORMAT_PDF, - 'content_type': CONTENT_TYPE_UNDEFINED, + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_UNDEFINED, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertFalse(result) def test_is_valid_item_access_data_missing_pid_v2_and_pid_v3(self): data = { - 'scielo_issn': '1234-5678', - 'pid_v2': '', - 'pid_v3': '', - 'media_format': MEDIA_FORMAT_PDF, - 'content_type': CONTENT_TYPE_FULL_TEXT, + "scielo_issn": "1234-5678", + "pid_v2": "", + "pid_v3": "", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = 
access.is_valid_item_access_data(data) self.assertFalse(result) def test_is_valid_item_access_data_media_format_html(self): data = { - 'scielo_issn': '1234-5678', - 'pid_v2': 'S0102-67202020000100001', - 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv', - 'media_format': MEDIA_FORMAT_HTML, - 'content_type': CONTENT_TYPE_FULL_TEXT, + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertTrue(result) def test_is_valid_item_access_data_content_type_abstract(self): data = { - 'scielo_issn': '1234-5678', - 'pid_v2': 'S0102-67202020000100001', - 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv', - 'media_format': MEDIA_FORMAT_PDF, - 'content_type': CONTENT_TYPE_ABSTRACT + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_ABSTRACT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertTrue(result) - def test_is_valid_item_acess_data_dataverse(self): + def test_is_valid_item_access_data_dataset_without_source_or_language_is_valid(self): data = { - 'scielo_issn': DEFAULT_SCIELO_ISSN, - 'pid_v2': None, - 'pid_v3': None, - 'pid_generic': 'DOI:10.48331/SCIELODATA.JLMAIY', - 'media_format': MEDIA_FORMAT_HTML, - 'content_type': CONTENT_TYPE_ABSTRACT, + "document_type": "dataset", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "DOI:10.48331/SCIELODATA.JLMAIY", + "media_language": "un", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_ABSTRACT, } - result, _ = index_utils.is_valid_item_access_data(data) + result, _ = access.is_valid_item_access_data(data) self.assertTrue(result) + + def test_is_valid_item_access_data_missing_media_language_is_invalid(self): + data = { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_language": "", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + result, _ = access.is_valid_item_access_data(data) + self.assertFalse(result) + + def test_extract_item_access_data_normalizes_source_fields_for_journal(self): + data = access.extract_item_access_data( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v2": "S0102-67202020000100001", + "media_language": "en", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2024", + "journal_main_title": "Journal Title", + "journal_subject_area_capes": ["Health Sciences"], + "journal_subject_area_wos": ["Medicine"], + "journal_acronym": "testjou", + "journal_publisher_name": ["SciELO"], + }, + ) + + self.assertEqual(data["source_type"], "journal") + self.assertEqual(data["source_id"], "1234-5678") + self.assertEqual(data["source_main_title"], "Journal Title") + self.assertEqual(data["source_acronym"], "testjou") + + def test_extract_item_access_data_normalizes_source_fields_for_books(self): + data = access.extract_item_access_data( + "books", + { + "book_id": "q7gtd", + "book_title": "Book Title", + "title_pid_generic": "book:q7gtd", + "pid_generic": "book:q7gtd/chapter:03", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": 
CONTENT_TYPE_FULL_TEXT, + "publication_year": "2023", + }, + ) + + self.assertEqual(data["source_type"], "book") + self.assertEqual(data["source_id"], "q7gtd") + self.assertEqual(data["scielo_issn"], DEFAULT_SCIELO_ISSN) + self.assertEqual(data["source_main_title"], "Book Title") + self.assertEqual(data["title_pid_generic"], "BOOK:Q7GTD") + + def test_extract_item_access_data_preserves_access_url_and_free_to_read(self): + data = access.extract_item_access_data( + "books", + { + "book_id": "c2248", + "book_title": "Book Title", + "title_pid_generic": "book:c2248", + "pid_generic": "book:c2248", + "media_language": "pt", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_url": "/id/c2248/pdf/freitas-9788599662830.pdf", + "source_access_type": "free_to_read", + }, + ) + + self.assertEqual(data["access_url"], "/id/c2248/pdf/freitas-9788599662830.pdf") + self.assertEqual(data["counter_access_type"], "Free_To_Read") + + def test_extract_item_access_data_tolerates_malformed_media_language(self): + data = access.extract_item_access_data( + "books", + { + "book_id": "q7gtd", + "pid_generic": "book:q7gtd", + "media_language": "'", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + + self.assertEqual(data["media_language"], "un") + + def test_extract_item_access_data_normalizes_scielo_collection_document_types(self): + preprint = access.extract_item_access_data( + "preprints", + { + "pid_generic": "10.1590/SciELOPreprints.1234", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + dataset = access.extract_item_access_data( + "data", + { + "pid_generic": "10.48331/scielodata.abc123", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_ABSTRACT, + }, + ) + article = access.extract_item_access_data( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + + self.assertEqual(preprint["source_type"], "preprint_server") + self.assertEqual(preprint["document_type"], "preprint") + self.assertEqual(dataset["source_type"], "data_repository") + self.assertEqual(dataset["document_type"], "dataset") + self.assertEqual(article["source_type"], "journal") + self.assertEqual(article["document_type"], "article") + + def test_update_results_with_item_access_data_stores_source_and_periods(self): + results = {} + item_access_data = { + "collection": "books", + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:Q7GTD", + "title_pid_generic": "BOOK:Q7GTD", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2023", + "source_main_title": "Book Title", + "source_subject_area_capes": [], + "source_subject_area_wos": [], + "source_acronym": None, + "source_publisher_name": ["SciELO Books"], + } + line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "local_datetime": datetime(2024, 1, 15, 10, 0, 5), + } + + access.update_results_with_item_access_data(results, item_access_data, line) + + self.assertEqual(len(results), 1) + result = next(iter(results.values())) + self.assertEqual(result["source"]["source_type"], "book") + self.assertEqual(result["source"]["source_id"], "q7gtd") + self.assertEqual(result["source"]["main_title"], "Book Title") + 
self.assertEqual(result["access_date"], "2024-01-15") + self.assertEqual(result["access_month"], "202401") + self.assertEqual(result["access_year"], "2024") + self.assertEqual(result["access_country_code"], "BR") + self.assertEqual(result["content_language"], "en") + self.assertEqual(result["title_pid_generic"], "BOOK:Q7GTD") + self.assertIn("user_session_id", result) + + def test_update_results_with_item_access_data_rejects_invalid_local_datetime(self): + results = {} + item_access_data = { + "collection": "books", + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_generic": "BOOK:Q7GTD", + "media_language": "en", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + } + line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "local_datetime": None, + } + + with self.assertRaises(ValueError): + access.update_results_with_item_access_data(results, item_access_data, line) + + self.assertEqual(results, {}) + + def test_update_results_with_item_access_data_does_not_expand_book_into_segments(self): + results = {} + item_access_data = { + "collection": "books", + "source_type": "book", + "source_id": "c2248", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248", + "title_pid_generic": "BOOK:C2248", + "segment_pid_generics": [ + "BOOK:C2248/CHAPTER:00", + "BOOK:C2248/CHAPTER:01", + "BOOK:C2248/CHAPTER:02", + ], + "media_language": "pt", + "media_format": MEDIA_FORMAT_PDF, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2018", + "source_main_title": "C2248 Book", + } + line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "local_datetime": datetime(2024, 1, 15, 10, 0, 5), + } + + access.update_results_with_item_access_data(results, item_access_data, line) + + self.assertEqual(len(results), 1) + result = list(results.values())[0] + self.assertEqual(result["pid_generic"], "BOOK:C2248") + + def test_double_click_filter_uses_url_bucket_for_same_item(self): + results = {} + item_access_data = { + "collection": "books", + "source_type": "book", + "source_id": "c2248", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "media_language": "pt", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2018", + "source_main_title": "C2248 Book", + } + base_line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + } + + access.update_results_with_item_access_data( + results, + item_access_data, + { + **base_line, + "local_datetime": datetime(2024, 1, 15, 10, 0, 5), + "url": "/id/c2248/03", + }, + ) + access.update_results_with_item_access_data( + results, + item_access_data, + { + **base_line, + "local_datetime": datetime(2024, 1, 15, 10, 0, 20), + "url": "https://books.scielo.org/id/c2248/epub/03.html?x=1", + }, + ) + + raw = next(iter(results.values())) + self.assertEqual( + set(raw["click_timestamps_by_url"]), + {"/id/c2248/03", "/id/c2248/epub/03.html"}, + ) + + metrics_data = index_docs.convert_raw_results_to_index_documents(results) + month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"] + + self.assertEqual(month_item["total_requests"], 2) + self.assertEqual(month_item["unique_requests"], 
1) + + def test_double_click_filter_collapses_same_url_within_30_seconds(self): + results = {} + item_access_data = { + "collection": "books", + "source_type": "book", + "source_id": "c2248", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "media_language": "pt", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + "publication_year": "2018", + "source_main_title": "C2248 Book", + } + base_line = { + "client_name": "browser", + "client_version": "1.0", + "ip_address": "127.0.0.1", + "country_code": "BR", + "url": "/id/c2248/03?from=search", + } + + access.update_results_with_item_access_data( + results, + item_access_data, + {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 5)}, + ) + access.update_results_with_item_access_data( + results, + item_access_data, + {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 20)}, + ) + + raw = next(iter(results.values())) + self.assertEqual( + raw["click_timestamps_by_url"], + {"/id/c2248/03": {"00:05": 1, "00:20": 1}}, + ) + + metrics_data = index_docs.convert_raw_results_to_index_documents(results) + month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"] + + self.assertEqual(month_item["total_requests"], 1) + self.assertEqual(month_item["unique_requests"], 1) + + def test_generate_index_names_for_year_and_month(self): + self.assertEqual( + generate_year_index_name("usage", "scl", "2024-01-15"), + "usage_yearly_scl_2024", + ) + self.assertEqual( + generate_month_index_name("usage", "scl", "2024-01-15"), + "usage_monthly_scl_2024", + ) + self.assertEqual( + generate_year_index_name("usage", "books", "2024-01-15"), + "usage_yearly_books", + ) + self.assertEqual( + generate_month_index_name("usage", "books", "2024-01-15"), + "usage_monthly_books", + ) + + def test_convert_raw_results_to_index_documents_creates_month_and_year_views(self): + data = { + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|browser|1.0|127.0.0.1|BR|en|html|full_text": { + "collection": "books", + "source_key": "q7gtd", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:Q7GTD/CHAPTER:03", + "title_pid_generic": "BOOK:Q7GTD", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "main_title": "Book Title", + "identifiers": { + "book_id": "q7gtd", + "isbn": "9788578791889", + }, + "city": "Sao Paulo", + "country": "BR", + "subject_area_capes": [], + "subject_area_wos": [], + "acronym": None, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2023", + } + } + + metrics_data = index_docs.convert_raw_results_to_index_documents(data) + + self.assertEqual(set(metrics_data.keys()), {"month", "year"}) + self.assertEqual(len(metrics_data["month"]), 2) + self.assertEqual(len(metrics_data["year"]), 2) + + month_item = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|2024-01|Open|Regular|2023"] + self.assertEqual(month_item["access_month"], "2024-01") + self.assertNotIn("access_country_code", month_item) + self.assertNotIn("content_language", month_item) + self.assertEqual(month_item["document_type"], "chapter") + 
self.assertEqual(month_item["metric_scope"], "item") + self.assertEqual(month_item["counter_data_type"], "Book_Segment") + self.assertEqual(month_item["title_pid_generic"], "BOOK:Q7GTD") + self.assertEqual(month_item["total_requests"], 1) + self.assertEqual(month_item["unique_requests"], 1) + self.assertNotIn("scielo_issn", month_item["source"]) + self.assertEqual(month_item["source"]["identifiers"]["book_id"], "q7gtd") + self.assertEqual(month_item["source"]["publisher"], ["SciELO Books"]) + + month_title = metrics_data["month"]["title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023"] + self.assertEqual(month_title["document_type"], "book") + self.assertEqual(month_title["metric_scope"], "title") + self.assertEqual(month_title["counter_data_type"], "Book") + self.assertEqual(month_title["pid_generic"], "BOOK:Q7GTD") + self.assertEqual(month_title["total_requests"], 1) + self.assertEqual(month_title["total_investigations"], 1) + self.assertEqual(month_title["unique_requests"], 1) + self.assertEqual(month_title["unique_investigations"], 1) + + year_item = metrics_data["year"][ + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|en|BR|2024|Open|Regular|2023" + ] + self.assertEqual(year_item["access_year"], "2024") + self.assertEqual(year_item["access_country_code"], "BR") + self.assertEqual(year_item["content_language"], "en") + self.assertEqual(year_item["metric_scope"], "item") + self.assertEqual(year_item["total_requests"], 1) + + year_title = metrics_data["year"][ + "title|books|q7gtd|||BOOK:Q7GTD|en|BR|2024|Open|Regular|2023" + ] + self.assertEqual(year_title["metric_scope"], "title") + self.assertEqual(year_title["total_requests"], 1) + self.assertEqual(year_title["total_investigations"], 1) + self.assertEqual(year_title["unique_requests"], 1) + self.assertEqual(year_title["unique_investigations"], 1) + + def test_convert_raw_results_to_index_documents_maps_counter_data_types(self): + data = { + "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|sess|BR|un|html|full_text": { + "collection": "preprints", + "source_key": "scielo-preprints", + "document_type": "preprint", + "pid_generic": "10.1590/SCIELOPREPRINTS.1234", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "un", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "preprint_server", + "source_id": "scielo-preprints", + "main_title": "SciELO Preprints", + }, + "publication_year": "2024", + }, + "data|scielo-data|||10.48331/SCIELODATA.ABC123|sess|BR|un|html|abstract": { + "collection": "data", + "source_key": "scielo-data", + "document_type": "dataset", + "pid_generic": "10.48331/SCIELODATA.ABC123", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "un", + "content_type": CONTENT_TYPE_ABSTRACT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "data_repository", + "source_id": "scielo-data", + "main_title": "SciELO Data", + }, + "publication_year": "2024", + }, + } + + metrics_data = index_docs.convert_raw_results_to_index_documents(data) + preprint_doc = metrics_data["month"][ + "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|2024-01|Open|Regular|2024" + ] + dataset_doc = metrics_data["month"][ + "data|scielo-data|||10.48331/SCIELODATA.ABC123|2024-01|Open|Regular|2024" + ] + + 
self.assertEqual(preprint_doc["counter_data_type"], "Article") + self.assertEqual(preprint_doc["scielo_document_type"], "preprint") + self.assertEqual(preprint_doc["article_version"], "Preprint") + self.assertEqual(dataset_doc["counter_data_type"], "Dataset") + self.assertIsNone(dataset_doc["article_version"]) + + def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_formats(self): + data = { + "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|html|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:45": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + } + + metrics_data = index_docs.convert_raw_results_to_index_documents(data) + + month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"] + month_title = metrics_data["month"]["title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"] + + self.assertEqual(month_item["total_requests"], 2) + self.assertEqual(month_item["total_investigations"], 2) + self.assertEqual(month_item["unique_requests"], 1) + self.assertEqual(month_item["unique_investigations"], 1) + self.assertEqual(month_title["unique_requests"], 1) + self.assertEqual(month_title["unique_investigations"], 1) + + def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_item_scope(self): + data = { + "books|c2248|||BOOK:C2248|sess|BR|pt|html|abstract": { + "collection": "books", + "source_key": "c2248", + "document_type": "book", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_ABSTRACT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248", "isbn": "9788599662830"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + } + + metrics_data = index_docs.convert_raw_results_to_index_documents(data) + + 
self.assertEqual( + set(metrics_data["month"].keys()), + {"title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"}, + ) + self.assertEqual( + set(metrics_data["year"].keys()), + {"title|books|c2248|||BOOK:C2248|pt|BR|2024|Open|Regular|2018"}, + ) + + def test_convert_raw_results_to_index_documents_counts_whole_book_without_segments_as_book_segment(self): + data = { + "books|c2248|||BOOK:C2248|sess|BR|pt|pdf|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "book", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_month": "202401", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + }, + } + + metrics_data = index_docs.convert_raw_results_to_index_documents(data) + month_item = metrics_data["month"]["books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"] + month_title = metrics_data["month"]["title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"] + + self.assertEqual(month_item["counter_data_type"], "Book_Segment") + self.assertEqual(month_item["metric_scope"], "item") + self.assertEqual(month_title["counter_data_type"], "Book") + self.assertEqual(month_title["metric_scope"], "title") + + def test_convert_raw_results_aggregates_multiple_chapters_correctly(self): + """Test that accessing multiple chapters creates correct title-level totals""" + data = { + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|session1|BR|en|html|full_text": { + "collection": "books", + "source_key": "q7gtd", + "document_type": "chapter", + "pid_generic": "BOOK:Q7GTD/CHAPTER:01", + "title_pid_generic": "BOOK:Q7GTD", + "user_session_id": "session1", + "click_timestamps": {"00:05": 1}, + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "main_title": "Book Title", + "identifiers": {"book_id": "q7gtd"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2023", + }, + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|session1|BR|en|html|full_text": { + "collection": "books", + "source_key": "q7gtd", + "document_type": "chapter", + "pid_generic": "BOOK:Q7GTD/CHAPTER:02", + "title_pid_generic": "BOOK:Q7GTD", + "user_session_id": "session1", # SAME SESSION + "click_timestamps": {"00:10": 1}, + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "q7gtd", + "scielo_issn": DEFAULT_SCIELO_ISSN, + "main_title": "Book Title", + "identifiers": {"book_id": "q7gtd"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2023", + }, + } + + metrics_data = index_docs.convert_raw_results_to_index_documents(data) + + # Should have 2 item documents (one per chapter) + 2 title documents (month and year) + self.assertEqual(len(metrics_data["month"]), 3) # 2 items + 1 title + self.assertEqual(len(metrics_data["year"]), 3) # 2 items + 1 title + + # Each item should have total=1, unique=1 + month_item_1 = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|2024-01|Open|Regular|2023"] + 
self.assertEqual(month_item_1["total_requests"], 1) + self.assertEqual(month_item_1["unique_requests"], 1) + + month_item_2 = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|2024-01|Open|Regular|2023"] + self.assertEqual(month_item_2["total_requests"], 1) + self.assertEqual(month_item_2["unique_requests"], 1) + + # Title should have total=2 (sum of both chapters) + # Title unique should be 1 (same session accessed book, counted once) + month_title = metrics_data["month"]["title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023"] + self.assertEqual(month_title["total_requests"], 2) + self.assertEqual(month_title["total_investigations"], 2) + self.assertEqual(month_title["unique_requests"], 1) + self.assertEqual(month_title["unique_investigations"], 1) + + def test_export_book_r51_monthly_metrics_writes_counter_title_columns(self): + from metrics.management.commands.export_book_r51_monthly_metrics import Command + + command = Command() + monthly_documents = command._build_monthly_documents( + { + "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": { + "collection": "books", + "source_key": "c2248", + "document_type": "chapter", + "pid_v2": None, + "pid_v3": None, + "pid_generic": "BOOK:C2248/CHAPTER:03", + "title_pid_generic": "BOOK:C2248", + "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", + "click_timestamps": {"00:05": 1}, + "access_country_code": "BR", + "content_language": "pt", + "content_type": CONTENT_TYPE_FULL_TEXT, + "access_date": "2024-01-15", + "access_year": "2024", + "source": { + "source_type": "book", + "source_id": "c2248", + "main_title": "C2248 Book", + "identifiers": {"book_id": "c2248"}, + "publisher_name": ["SciELO Books"], + }, + "publication_year": "2018", + } + } + ) + + with TemporaryDirectory() as tmpdir: + title_path = Path(tmpdir) / "title.csv" + command._write_title_csv(title_path, monthly_documents["title"]) + + with title_path.open(newline="") as fh: + reader = csv.DictReader(fh) + rows = list(reader) + + self.assertEqual( + reader.fieldnames, + [ + "year_month", + "title_pid_generic", + "document_type", + "total_item_requests", + "total_item_investigations", + "unique_title_requests", + "unique_title_investigations", + ], + ) + self.assertNotIn("total_title_requests", reader.fieldnames) + self.assertEqual(rows[0]["year_month"], "2024-01") + self.assertEqual(rows[0]["total_item_requests"], "1") + self.assertEqual(rows[0]["unique_title_requests"], "1") diff --git a/metrics/tests/test_opensearch.py b/metrics/tests/test_opensearch.py new file mode 100644 index 0000000..80586f9 --- /dev/null +++ b/metrics/tests/test_opensearch.py @@ -0,0 +1,92 @@ +from unittest import TestCase +from unittest.mock import Mock, patch + +from django.test import override_settings + +from metrics import opensearch + + +class OpenSearchUsageClientTests(TestCase): + @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client") + def test_create_index_sends_mappings_in_request_body(self, mock_get_client): + mock_client = Mock() + mock_get_client.return_value = mock_client + + client = opensearch.OpenSearchUsageClient(url="https://example.org:9200") + client.create_index( + index_name="usage_monthly_books_202506", + mappings=opensearch.MONTH_INDEX_MAPPINGS, + ) + + mock_client.indices.create.assert_called_once_with( + index="usage_monthly_books_202506", + body={ + "settings": {"index": {"number_of_replicas": 0}}, + "mappings": opensearch.MONTH_INDEX_MAPPINGS, + }, + ) + + @override_settings( + OPENSEARCH_VERIFY_CERTS=True, + 
OPENSEARCH_BASIC_AUTH=None, + OPENSEARCH_API_KEY=None, + ) + @patch("metrics.opensearch.client.OpenSearch") + def test_verify_certs_false_explicitly_overrides_settings(self, mock_opensearch): + opensearch.OpenSearchUsageClient( + url="https://example.org:9200", + verify_certs=False, + ) + + mock_opensearch.assert_called_once_with( + "https://example.org:9200", + verify_certs=False, + ) + + def test_get_index_mappings_returns_books_specific_mappings(self): + self.assertIs( + opensearch.get_index_mappings("books", "month"), + opensearch.BOOKS_MONTH_INDEX_MAPPINGS, + ) + self.assertIs( + opensearch.get_index_mappings("books", "year"), + opensearch.BOOKS_YEAR_INDEX_MAPPINGS, + ) + self.assertIn("metric_scope", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"]) + self.assertIn("counter_data_type", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"]) + self.assertIn("title_pid_generic", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"]) + self.assertIn("applied_jobs", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"]) + + @patch("metrics.opensearch.client.helpers.bulk") + @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client") + def test_increment_documents_for_daily_job_uses_applied_jobs( + self, + mock_get_client, + mock_bulk, + ): + mock_get_client.return_value = Mock() + client = opensearch.OpenSearchUsageClient(url="https://example.org:9200") + + client.increment_documents_for_daily_job( + index_name="usage_monthly_books_202506", + documents={ + "doc-1": { + "collection": "books", + "pid": "BOOK:WD", + "pid_generic": "BOOK:WD", + "access_date": "2025-06-03", + "total_requests": 3, + "total_investigations": 4, + "unique_requests": 2, + "unique_investigations": 3, + } + }, + job_id="books|2025-06-03|abc123", + ) + + actions = list(mock_bulk.call_args.args[1]) + self.assertEqual(len(actions), 1) + action = actions[0] + self.assertEqual(action["_op_type"], "update") + self.assertEqual(action["script"]["params"]["job_id"], "books|2025-06-03|abc123") + self.assertEqual(action["upsert"], {"applied_jobs": []}) diff --git a/metrics/tests/test_tasks.py b/metrics/tests/test_tasks.py new file mode 100644 index 0000000..932944f --- /dev/null +++ b/metrics/tests/test_tasks.py @@ -0,0 +1,268 @@ +from datetime import date, timedelta +from unittest.mock import patch + +from django.test import TestCase +from django.utils import timezone + +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from metrics import tasks +from metrics.models import DailyMetricJob + + +class ParseLogsTaskTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def _log_file(self, hash_value, probably_date, status=choices.LOG_FILE_STATUS_QUEUED): + return LogFile.objects.create( + hash=hash_value, + path=f"/tmp/{hash_value}.log.gz", + stat_result={}, + status=status, + collection=self.collection, + date=date.fromisoformat(probably_date), + validation={"probably_date": probably_date}, + ) + + def test_task_parse_logs_enqueues_one_daily_job_per_collection_date(self): + first = self._log_file("1" * 32, "2012-03-10") + second = self._log_file("2" * 32, "2012-03-10") + third = self._log_file("3" * 32, "2012-03-15") + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + result = tasks.task_parse_logs.run( + collections=["books"], + include_logs_with_error=False, + from_date="2012-03-01", + until_date="2012-03-31", + ) + + 
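+        # Two of the three log files share 2012-03-10, so only two daily jobs
+        # (one per distinct collection/date pair) should be enqueued.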
self.assertEqual(result["enqueued_jobs"], 2) + self.assertEqual(mocked_apply_async.call_count, 2) + jobs = list(DailyMetricJob.objects.order_by("access_date")) + self.assertEqual([job.access_date for job in jobs], [date(2012, 3, 10), date(2012, 3, 15)]) + self.assertEqual(jobs[0].input_log_hashes, sorted([first.hash, second.hash])) + self.assertEqual(jobs[1].input_log_hashes, [third.hash]) + + def test_task_parse_logs_allows_queue_override_and_robots_source(self): + self._log_file("1" * 32, "2012-03-10") + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + tasks.task_parse_logs.run( + collections=["books"], + include_logs_with_error=False, + from_date="2012-03-01", + until_date="2012-03-31", + queue_name="parse_small_mult", + robots_source="counter", + ) + + mocked_apply_async.assert_called_once() + self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult") + self.assertEqual(mocked_apply_async.call_args.kwargs["args"][-1], "counter") + + def test_task_parse_logs_skip_log_hashes_prevents_reprocessing_same_auto_run(self): + skipped = self._log_file("1" * 32, "2012-03-10", status=choices.LOG_FILE_STATUS_ERROR) + queued = self._log_file("2" * 32, "2012-03-11") + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + result = tasks.task_parse_logs.run( + collections=["books"], + include_logs_with_error=True, + from_date="2012-03-01", + until_date="2012-03-31", + skip_log_hashes=[skipped.hash], + ) + + mocked_apply_async.assert_called_once() + job = DailyMetricJob.objects.get() + self.assertEqual(job.input_log_hashes, [queued.hash]) + self.assertEqual(result["enqueued_jobs"], 1) + + def test_wait_parse_logs_wave_rechecks_until_daily_jobs_complete(self): + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, + ) + + with patch("metrics.tasks.task_wait_parse_logs_wave.apply_async") as mocked_wait_apply_async: + with patch("metrics.tasks.task_parse_logs.apply_async") as mocked_parse_logs_apply_async: + result = tasks.task_wait_parse_logs_wave.run( + wave_log_hashes=[job.pk], + collections=["books"], + include_logs_with_error=False, + max_log_files=2, + auto_reexecute=True, + ) + + self.assertEqual(result, {"wave_completed": False, "reexecution_enqueued": False}) + mocked_parse_logs_apply_async.assert_not_called() + mocked_wait_apply_async.assert_called_once() + + +class ResumeDailyMetricJobTests(TestCase): + def setUp(self): + self.collection = Collection.objects.create(acron3="books", acron2="bk") + + def test_resume_log_exports_requeues_error_daily_jobs(self): + log_file = LogFile.objects.create( + hash="1" * 32, + path="/tmp/1.log.gz", + stat_result={}, + status=choices.LOG_FILE_STATUS_ERROR, + collection=self.collection, + date=date(2012, 3, 10), + ) + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + input_log_hashes=[log_file.hash], + ) + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + result = tasks.task_resume_log_exports.run( + collections=["books"], + from_date="2012-03-01", + until_date="2012-03-31", + queue_name="parse_small_mult", + ) + + mocked_apply_async.assert_called_once() + self.assertEqual(mocked_apply_async.call_args.kwargs["args"][0], job.pk) + self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult") + 
self.assertEqual(result["resumed_logs"], 1) + + def test_resume_log_exports_clears_payload_when_current_logs_change(self): + log_file = LogFile.objects.create( + hash="2" * 32, + path="/tmp/2.log.gz", + stat_result={}, + status=choices.LOG_FILE_STATUS_QUEUED, + collection=self.collection, + date=date(2012, 3, 10), + ) + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + input_log_hashes=["1" * 32], + storage_path="books/2012/03/2012-03-10.json", + payload_hash="abc", + summary={"month_document_count": 1}, + ) + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async"): + tasks.task_resume_log_exports.run( + collections=["books"], + from_date="2012-03-01", + until_date="2012-03-31", + ) + + job.refresh_from_db() + self.assertEqual(job.input_log_hashes, [log_file.hash]) + self.assertEqual(job.storage_path, "") + self.assertEqual(job.payload_hash, "") + self.assertEqual(job.summary, {}) + + def test_resume_log_exports_preserves_payload_when_current_logs_match(self): + log_file = LogFile.objects.create( + hash="1" * 32, + path="/tmp/1.log.gz", + stat_result={}, + status=choices.LOG_FILE_STATUS_ERROR, + collection=self.collection, + date=date(2012, 3, 10), + ) + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + input_log_hashes=[log_file.hash], + storage_path="books/2012/03/2012-03-10.json", + payload_hash="abc", + summary={"month_document_count": 1}, + ) + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async"): + tasks.task_resume_log_exports.run( + collections=["books"], + from_date="2012-03-01", + until_date="2012-03-31", + ) + + job.refresh_from_db() + self.assertEqual(job.storage_path, "books/2012/03/2012-03-10.json") + self.assertEqual(job.payload_hash, "abc") + self.assertEqual(job.summary, {"month_document_count": 1}) + + def test_resume_log_exports_requeues_stored_payload_without_current_logs(self): + job = DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + input_log_hashes=["1" * 32], + storage_path="books/2012/03/2012-03-10.json", + payload_hash="abc", + ) + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + result = tasks.task_resume_log_exports.run( + collections=["books"], + from_date="2012-03-01", + until_date="2012-03-31", + ) + + mocked_apply_async.assert_called_once() + self.assertEqual(mocked_apply_async.call_args.kwargs["args"][0], job.pk) + self.assertEqual(result["resumed_jobs"], 1) + + def test_resume_log_exports_skips_jobs_without_logs_or_payload(self): + DailyMetricJob.objects.create( + collection=self.collection, + access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_ERROR, + ) + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + result = tasks.task_resume_log_exports.run( + collections=["books"], + from_date="2012-03-01", + until_date="2012-03-31", + ) + + mocked_apply_async.assert_not_called() + self.assertEqual(result["resumed_jobs"], 0) + + def test_resume_log_exports_releases_stale_exporting_jobs(self): + log_file = LogFile.objects.create( + hash="1" * 32, + path="/tmp/1.log.gz", + stat_result={}, + status=choices.LOG_FILE_STATUS_ERROR, + collection=self.collection, + date=date(2012, 3, 10), + ) + job = DailyMetricJob.objects.create( + collection=self.collection, + 
access_date=date(2012, 3, 10), + status=DailyMetricJob.STATUS_EXPORTING, + input_log_hashes=[log_file.hash], + export_started_at=timezone.now() - timedelta(minutes=120), + ) + + with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async: + result = tasks.task_resume_log_exports.run( + collections=["books"], + from_date="2012-03-01", + until_date="2012-03-31", + stale_after_minutes=60, + ) + + job.refresh_from_db() + self.assertEqual(job.status, DailyMetricJob.STATUS_PENDING) + mocked_apply_async.assert_called_once() + self.assertEqual(result["released_stale_batches"], 1) diff --git a/metrics/utils/__init__.py b/metrics/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/metrics/utils/index_utils.py b/metrics/utils/index_utils.py deleted file mode 100644 index 76af8c2..0000000 --- a/metrics/utils/index_utils.py +++ /dev/null @@ -1,331 +0,0 @@ -from scielo_usage_counter.counter import compute_r5_metrics -from scielo_usage_counter.values import CONTENT_TYPE_UNDEFINED, MEDIA_FORMAT_UNDEFINED - -from core.utils import standardizer -from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour - - -def generate_user_session_id(client_name, client_version, ip_address, datetime, sep='|'): - """ - Generates a user session ID based on the provided parameters. - - Parameters: - client_name (str): The name of the client. - client_version (str): The version of the client. - ip_address (str): The IP address of the user. - datetime (datetime): The datetime object representing the session time. - sep (str): The separator to use in the ID. Default is '|'. - - Returns: - str: A user session ID formatted as a string. - """ - dt_year_month_day = datetime.strftime('%Y-%m-%d') - dt_hour = datetime.strftime('%H') - - return sep.join([ - str(client_name), - str(client_version), - str(ip_address), - str(dt_year_month_day), - str(dt_hour), - ]) - - -def generate_item_access_id(col_acron3, scielo_issn, pid_v2, pid_v3, pid_generic, user_session_id, country_code, media_language, media_format, content_type, sep='|'): - """ - Generates an item access ID based on the provided parameters. - - Parameters: - col_acron3 (str): The acronym of the collection. - scielo_issn (str): The ISSN of the SciELO journal. - pid_v2 (str): The PID version 2. - pid_v3 (str): The PID version 3. - pid_generic (str): The generic PID. - user_session_id (str): The user session ID. - country_code (str): The country code of the user. - media_language (str): The language of the media. - media_format (str): The format of the media. - content_type (str): The type of content. - sep (str): The separator to use in the ID. Default is '|'. - """ - return sep.join([ - col_acron3, - scielo_issn, - pid_v2 or '', - pid_v3 or '', - pid_generic or '', - user_session_id, - country_code, - media_language, - media_format, - content_type, - ]) - - -def generate_index_name(index_prefix: str, collection: str, date: str): - """ Generates an index name based on the provided parameters. - Parameters: - index_prefix (str): The prefix for the index name. - collection (str): The collection acronym. - date (str): The date string in 'YYYY-MM-DD' format. - Returns: - str: The formatted index name. 
- """ - if not date or not isinstance(date, str): - raise ValueError("Date must be a non-empty string in 'YYYY-MM-DD' format.") - - if not collection or not isinstance(collection, str): - raise ValueError("Collection must be a non-empty string.") - - if not index_prefix or not isinstance(index_prefix, str): - raise ValueError("Index prefix must be a non-empty string.") - - index_year, _, _ = date.split('-') - return f'{index_prefix}_{collection}_{index_year}' - - -def generate_index_id(collection, journal, pid_v2, pid_v3, pid_generic, media_language, country_code, date_str): - """ - Generates a unique index key based on the provided parameters. - This is different from the item access ID as it does not include user session, media_format, and content_type information. - It is used for indexing purposes. - - Parameters: - collection (str): The collection acronym. - journal (str): The journal name. - pid_v2 (str): The PID version 2. - pid_v3 (str): The PID version 3. - pid_generic (str): The generic PID. - media_language (str): The media language code. - country_code (str): The country code. - date_str (str): The date string in 'YYYY-MM-DD' format. - - Returns: - str: A unique index key formatted as a string. - """ - return '|'.join([ - collection, - journal, - pid_v2 or '', - pid_v3 or '', - pid_generic or '', - media_language, - country_code, - date_str - ]) - - -def extract_item_access_data(collection_acron3:str, translated_url: dict): - """ - Extracts item access data from the translated URL and standardizes it. - - Args: - collection_acron3 (str): The acronym of the collection. - translated_url (dict): The translated URL containing metadata. - - Returns: - dict: A dictionary containing standardized item access data, or None if the data is invalid. - """ - if not translated_url or not isinstance(translated_url, dict): - return {} - - item_access_data = { - 'collection': collection_acron3, - 'scielo_issn': translated_url.get('scielo_issn'), - 'pid_v2': standardizer.standardize_pid_v2(translated_url.get('pid_v2')), - 'pid_v3': standardizer.standardize_pid_v3(translated_url.get('pid_v3')), - 'pid_generic': standardizer.standardize_pid_generic(translated_url.get('pid_generic')), - 'media_language': standardizer.standardize_language_code(translated_url.get('media_language')), - 'media_format': translated_url.get('media_format'), - 'content_type': translated_url.get('content_type'), - 'year_of_publication': standardizer.standardize_year_of_publication(translated_url.get('year_of_publication')), - 'journal_main_title': translated_url.get('journal_main_title'), - 'journal_subject_area_capes': translated_url.get('journal_subject_area_capes'), - 'journal_subject_area_wos': translated_url.get('journal_subject_area_wos'), - 'journal_acronym': translated_url.get('journal_acronym'), - 'journal_publisher_name': translated_url.get('journal_publisher_name'), - } - - return item_access_data - - -def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False): - """ - Validates the item access data based on the provided parameters. - - Parameters: - data (dict): A dictionary containing the following keys: - - scielo_issn (str): The ISSN of the SciELO journal. - - pid_v2 (str): The PID version 2 of the document. - - pid_v3 (str): The PID version 3 of the document. - - media_format (str): The media format of the document. - - content_type (str): The content type of the document. 
- utm: URL translation manager for converting URLs - ignore_utm_validation (bool): If True, skips validation against the URL translation manager. - - Returns: - tuple: A tuple containing a boolean indicating whether the data is valid and a message. - If the data is valid, the first element is True and the second element is a success message. - If the data is invalid, the first element is False and the second element is an error message. - """ - if not isinstance(data, dict): - return False, {'message': 'Invalid data format. Expected a dictionary.', 'code': 'invalid_format'} - - scielo_issn = data.get('scielo_issn') - media_format = data.get('media_format') - content_type = data.get('content_type') - pid_v2 = data.get('pid_v2') - pid_v3 = data.get('pid_v3') - pid_generic = data.get('pid_generic') - - if not all([ - scielo_issn, - media_format and media_format != MEDIA_FORMAT_UNDEFINED, - content_type and content_type != CONTENT_TYPE_UNDEFINED, - pid_v2 or pid_v3 or pid_generic, - ]): - return False, {'message': 'Missing required fields in item access data.', 'code': 'missing_fields'} - - # Check ISSN and PIDs validity using the URL translation manager - if utm and not ignore_utm_validation: - if not utm.is_valid_code(scielo_issn, utm.journals_metadata['issn_set']): - return False, {'message': f'Invalid scielo_issn: {scielo_issn}', 'code': 'invalid_scielo_issn'} - - if pid_v2 and not utm.is_valid_code(pid_v2, utm.articles_metadata['pid_set']): - return False, {'message': f'Invalid pid_v2: {pid_v2}', 'code': 'invalid_pid_v2'} - - if pid_v3 and not utm.is_valid_code(pid_v3, utm.articles_metadata['pid_set']): - return False, {'message': f'Invalid pid_v3: {pid_v3}', 'code': 'invalid_pid_v3'} - - if pid_generic and not utm.is_valid_code(pid_generic, utm.articles_metadata['pid_set']): - return False, {'message': f'Invalid pid_generic: {pid_generic}', 'code': 'invalid_pid_generic'} - - return True, {'message': 'Item access data is valid.', 'code': 'valid'} - - -def update_results_with_item_access_data(results: dict, item_access_data: dict, line: dict): - """ - Updates the item access data with the information from the log line. - - Args: - data (dict): The dictionary to store item access data. - item_access_data (dict): The item access data extracted from the translated URL. - line (dict): The log line containing additional information. - - Returns: - None. 
- """ - col_acron3 = item_access_data.get('collection') - scielo_issn = item_access_data.get('scielo_issn') - pid_v2 = item_access_data.get('pid_v2') - pid_v3 = item_access_data.get('pid_v3') - pid_generic = item_access_data.get('pid_generic') - - media_format = item_access_data.get('media_format') - media_language = item_access_data.get('media_language') - content_type = item_access_data.get('content_type') - - client_name = line.get('client_name') - client_version = line.get('client_version') - local_datetime = line.get('local_datetime') - country_code = line.get('country_code') - ip_address = line.get('ip_address') - - truncated_datetime = truncate_datetime_to_hour(local_datetime) - ms_key = extract_minute_second_key(local_datetime) - - user_session_id = generate_user_session_id( - client_name, - client_version, - ip_address, - truncated_datetime, - ) - - item_access_id = generate_item_access_id( - user_session_id=user_session_id, - col_acron3=col_acron3, - scielo_issn=scielo_issn, - pid_v2=pid_v2, - pid_v3=pid_v3, - pid_generic=pid_generic, - media_language=media_language, - country_code=country_code, - media_format=media_format, - content_type=content_type, - ) - - if item_access_id not in results: - results[item_access_id] = { - 'click_timestamps': {ms_key: 0}, - 'media_format': media_format, - 'media_language': media_language, - 'content_type': content_type, - 'country_code': country_code, - 'date_str': truncated_datetime.strftime('%Y-%m-%d'), - 'date': truncated_datetime, - 'year_of_publication': item_access_data.get('year_of_publication'), - 'journal': { - 'scielo_issn': item_access_data.get('scielo_issn'), - 'main_title': item_access_data.get('journal_main_title'), - 'subject_area_capes': item_access_data.get('journal_subject_area_capes'), - 'subject_area_wos': item_access_data.get('journal_subject_area_wos'), - 'acronym': item_access_data.get('journal_acronym'), - 'publisher_name': item_access_data.get('journal_publisher_name'), - }, - } - - # Check if the click timestamp for this minute-second key exists, if not, initialize it - if ms_key not in results[item_access_id]['click_timestamps']: - results[item_access_id]['click_timestamps'][ms_key] = 0 - - # Increment the click timestamp count - results[item_access_id]['click_timestamps'][ms_key] += 1 - - -def convert_to_index_documents(data: dict, key_sep='|'): - """ - Converts the provided data into a format suitable for indexing metrics. - This function processes the data dictionary, extracting relevant fields and computing metrics. - - Args: - data (dict): A dictionary containing the metrics data to be processed. - - Returns: - dict: A dictionary containing the processed metrics data, ready for indexing. 
- """ - if not isinstance(data, dict): - return {} - - metrics_data = {} - - for key, value in data.items(): - collection, scielo_issn, pid_v2, pid_v3, pid_generic, _, _, _, _, _, country_code, media_language, _, content_type = key.split(key_sep) - - document_id = generate_index_id( - collection, - scielo_issn, - pid_v2, - pid_v3, - pid_generic, - media_language, - country_code, - value.get('date_str') - ) - - compute_r5_metrics( - document_id, - metrics_data, - collection, - value.get('journal'), - pid_v2, - pid_v3, - pid_generic, - value.get('year_of_publication'), - media_language, - value.get('country_code'), - value.get('date_str'), - value.get('click_timestamps'), - content_type, - ) - - return metrics_data diff --git a/metrics/wagtail_hooks.py b/metrics/wagtail_hooks.py new file mode 100644 index 0000000..94c2ffb --- /dev/null +++ b/metrics/wagtail_hooks.py @@ -0,0 +1,22 @@ +from django.utils.translation import gettext_lazy as _ +from wagtail.snippets.views.snippets import SnippetViewSet + +from metrics.models import DailyMetricJob + +class DailyMetricJobSnippetViewSet(SnippetViewSet): + model = DailyMetricJob + menu_label = _("Daily Metric Jobs") + icon = "history" + menu_order = 600 + list_display = ( + "collection", + "access_date", + "status", + "input_log_count", + "attempts", + "export_started_at", + "exported_at", + "updated", + ) + list_filter = ("status", "collection", "access_date") + search_fields = ("collection__acron3", "error_message") diff --git a/article/management/__init__.py b/reports/__init__.py similarity index 100% rename from article/management/__init__.py rename to reports/__init__.py diff --git a/reports/apps.py b/reports/apps.py new file mode 100644 index 0000000..119ca26 --- /dev/null +++ b/reports/apps.py @@ -0,0 +1,8 @@ +from django.apps import AppConfig +from django.utils.translation import gettext_lazy as _ + + +class ReportsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "reports" + verbose_name = _("Reports") diff --git a/reports/migrations/0001_initial.py b/reports/migrations/0001_initial.py new file mode 100644 index 0000000..2a72923 --- /dev/null +++ b/reports/migrations/0001_initial.py @@ -0,0 +1,140 @@ +# Generated by Django 5.2.12 on 2026-05-01 15:50 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("collection", "0001_initial"), + ] + + operations = [ + migrations.CreateModel( + name="MonthlyLogReport", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("total_files", models.IntegerField(default=0)), + ("created_files", models.IntegerField(default=0)), + ("validated_files", models.IntegerField(default=0)), + ("invalidated_files", models.IntegerField(default=0)), + ("errored_files", models.IntegerField(default=0)), + ("lines_parsed", models.IntegerField(default=0)), + ("valid_lines", models.IntegerField(default=0)), + ("discarded_lines", models.IntegerField(default=0)), + ("ip_local_count", models.IntegerField(default=0)), + ("ip_remote_count", models.IntegerField(default=0)), + ("ip_unknown_count", models.IntegerField(default=0)), + ("generated_at", models.DateTimeField(auto_now=True)), + ("year", models.IntegerField(verbose_name="Year")), + ("month", models.IntegerField(verbose_name="Month")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + 
verbose_name="Collection", + ), + ), + ], + options={ + "verbose_name": "Monthly Log Report", + "verbose_name_plural": "Monthly Log Reports", + "ordering": ["-year", "-month", "collection__acron3"], + "unique_together": {("collection", "year", "month")}, + }, + ), + migrations.CreateModel( + name="WeeklyLogReport", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("total_files", models.IntegerField(default=0)), + ("created_files", models.IntegerField(default=0)), + ("validated_files", models.IntegerField(default=0)), + ("invalidated_files", models.IntegerField(default=0)), + ("errored_files", models.IntegerField(default=0)), + ("lines_parsed", models.IntegerField(default=0)), + ("valid_lines", models.IntegerField(default=0)), + ("discarded_lines", models.IntegerField(default=0)), + ("ip_local_count", models.IntegerField(default=0)), + ("ip_remote_count", models.IntegerField(default=0)), + ("ip_unknown_count", models.IntegerField(default=0)), + ("generated_at", models.DateTimeField(auto_now=True)), + ("year", models.IntegerField(verbose_name="Year")), + ("week", models.IntegerField(verbose_name="ISO Week")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ], + options={ + "verbose_name": "Weekly Log Report", + "verbose_name_plural": "Weekly Log Reports", + "ordering": ["-year", "-week", "collection__acron3"], + "unique_together": {("collection", "year", "week")}, + }, + ), + migrations.CreateModel( + name="YearlyLogReport", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("total_files", models.IntegerField(default=0)), + ("created_files", models.IntegerField(default=0)), + ("validated_files", models.IntegerField(default=0)), + ("invalidated_files", models.IntegerField(default=0)), + ("errored_files", models.IntegerField(default=0)), + ("lines_parsed", models.IntegerField(default=0)), + ("valid_lines", models.IntegerField(default=0)), + ("discarded_lines", models.IntegerField(default=0)), + ("ip_local_count", models.IntegerField(default=0)), + ("ip_remote_count", models.IntegerField(default=0)), + ("ip_unknown_count", models.IntegerField(default=0)), + ("generated_at", models.DateTimeField(auto_now=True)), + ("year", models.IntegerField(verbose_name="Year")), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ], + options={ + "verbose_name": "Yearly Log Report", + "verbose_name_plural": "Yearly Log Reports", + "ordering": ["-year", "collection__acron3"], + "unique_together": {("collection", "year")}, + }, + ), + ] diff --git a/reports/migrations/0002_alter_monthlylogreport_options_and_more.py b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py new file mode 100644 index 0000000..659215c --- /dev/null +++ b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py @@ -0,0 +1,36 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("reports", "0001_initial"), + ] + + operations = [ + migrations.AlterModelOptions( + name="monthlylogreport", + options={ + "ordering": ["collection__acron3", "year", "month"], + "verbose_name": "Monthly Log Report", + "verbose_name_plural": 
"Monthly Log Reports", + }, + ), + migrations.AlterModelOptions( + name="weeklylogreport", + options={ + "ordering": ["collection__acron3", "year", "week"], + "verbose_name": "Weekly Log Report", + "verbose_name_plural": "Weekly Log Reports", + }, + ), + migrations.AlterModelOptions( + name="yearlylogreport", + options={ + "ordering": ["collection__acron3", "year"], + "verbose_name": "Yearly Log Report", + "verbose_name_plural": "Yearly Log Reports", + }, + ), + ] diff --git a/article/management/commands/__init__.py b/reports/migrations/__init__.py similarity index 100% rename from article/management/commands/__init__.py rename to reports/migrations/__init__.py diff --git a/reports/models.py b/reports/models.py new file mode 100644 index 0000000..3af1ec8 --- /dev/null +++ b/reports/models.py @@ -0,0 +1,100 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection + + +class AbstractLogReport(models.Model): + collection = models.ForeignKey( + Collection, + on_delete=models.CASCADE, + verbose_name=_("Collection"), + ) + total_files = models.IntegerField(default=0) + created_files = models.IntegerField(default=0) + validated_files = models.IntegerField(default=0) + invalidated_files = models.IntegerField(default=0) + errored_files = models.IntegerField(default=0) + lines_parsed = models.IntegerField(default=0) + valid_lines = models.IntegerField(default=0) + discarded_lines = models.IntegerField(default=0) + ip_local_count = models.IntegerField(default=0) + ip_remote_count = models.IntegerField(default=0) + ip_unknown_count = models.IntegerField(default=0) + generated_at = models.DateTimeField(auto_now=True) + + class Meta: + abstract = True + + @property + def pct_validated(self): + if not self.total_files: + return 0 + return round(self.validated_files / self.total_files * 100, 1) + pct_validated.fget.short_description = _("% Valid Files") + + @property + def pct_valid_lines(self): + if not self.lines_parsed: + return 0 + return round(self.valid_lines / self.lines_parsed * 100, 1) + pct_valid_lines.fget.short_description = _("% Valid Lines") + + @property + def pct_remote_ip(self): + total = self.ip_remote_count + self.ip_local_count + if not total: + return 0 + return round(self.ip_remote_count / total * 100, 1) + pct_remote_ip.fget.short_description = _("% Remote IP") + + def __str__(self): + return f"{self.collection.acron3} {self.period_label}" + + @property + def period_label(self): + raise NotImplementedError + + +class WeeklyLogReport(AbstractLogReport): + year = models.IntegerField(verbose_name=_("Year")) + week = models.IntegerField(verbose_name=_("ISO Week")) + + class Meta: + unique_together = [("collection", "year", "week")] + ordering = ["collection__acron3", "year", "week"] + verbose_name = _("Weekly Log Report") + verbose_name_plural = _("Weekly Log Reports") + + @property + def period_label(self): + return f"{self.year}-W{self.week:02d}" + + +class MonthlyLogReport(AbstractLogReport): + year = models.IntegerField(verbose_name=_("Year")) + month = models.IntegerField(verbose_name=_("Month")) + + class Meta: + unique_together = [("collection", "year", "month")] + ordering = ["collection__acron3", "year", "month"] + verbose_name = _("Monthly Log Report") + verbose_name_plural = _("Monthly Log Reports") + + @property + def period_label(self): + return f"{self.year}-{self.month:02d}" + + +class YearlyLogReport(AbstractLogReport): + year = models.IntegerField(verbose_name=_("Year")) + + class Meta: + 
unique_together = [("collection", "year")] + ordering = ["collection__acron3", "year"] + verbose_name = _("Yearly Log Report") + verbose_name_plural = _("Yearly Log Reports") + + @property + def period_label(self): + return str(self.year) diff --git a/reports/tasks.py b/reports/tasks.py new file mode 100644 index 0000000..69a53a1 --- /dev/null +++ b/reports/tasks.py @@ -0,0 +1,238 @@ +import logging +from collections import defaultdict + +from django.core.mail import send_mail +from django.conf import settings +from django.utils.translation import gettext as _ + +from config import celery_app +from core.utils import date_utils +from collection.models import Collection +from log_manager import choices +from log_manager.models import LogFile +from log_manager_config import models as lmc_models + +from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport + + +def _extract_date_from_log_file(lf): + if lf.date: + return lf.date + + probably_date = (lf.validation or {}).get("probably_date") + if isinstance(probably_date, str) and probably_date: + return date_utils.get_date_obj(probably_date) + + try: + import re + match = re.search(r"(\d{4}-\d{2}-\d{2})", lf.path) + if match: + return date_utils.get_date_obj(match.group(1)) + except Exception: + pass + + return None + + +@celery_app.task(bind=True, name=_("[Reports] Populate All Reports")) +def task_populate_all_reports(self, year=None, collection_acron=None): + qs = LogFile.objects.select_related("collection") + if collection_acron: + qs = qs.filter(collection__acron3=collection_acron) + qs = qs.only( + "id", "collection_id", "date", "path", "status", "summary", "validation" + ) + + weekly = defaultdict(lambda: defaultdict(int)) + monthly = defaultdict(lambda: defaultdict(int)) + yearly = defaultdict(lambda: defaultdict(int)) + + for lf in qs.iterator(chunk_size=2000): + extracted_date = _extract_date_from_log_file(lf) + if not extracted_date: + continue + if year and extracted_date.year != int(year): + continue + + iso_year, iso_week, _ = extracted_date.isocalendar() + yr = extracted_date.year + mo = extracted_date.month + + for agg, key in [ + (weekly, (lf.collection_id, iso_year, iso_week)), + (monthly, (lf.collection_id, yr, mo)), + (yearly, (lf.collection_id, yr)), + ]: + r = agg[key] + r["total_files"] += 1 + st = lf.status + if st == "CRE": + r["created_files"] += 1 + elif st in ("QUE", "PAR", "PRO"): + r["validated_files"] += 1 + elif st == "INV": + r["invalidated_files"] += 1 + elif st == "ERR": + r["errored_files"] += 1 + + s = lf.summary or {} + lp = s.get("lines_parsed", 0) or 0 + vl = s.get("valid_lines", 0) or 0 + r["lines_parsed"] += lp + r["valid_lines"] += vl + r["discarded_lines"] += max(lp - vl, 0) + + ips = ( + (lf.validation or {}) + .get("content", {}) + .get("summary", {}) + .get("ips", {}) + ) + r["ip_local_count"] += ips.get("local", 0) or 0 + r["ip_remote_count"] += ips.get("remote", 0) or 0 + r["ip_unknown_count"] += ips.get("unknown", 0) or 0 + + w_count = _upsert_reports(WeeklyLogReport, weekly) + m_count = _upsert_reports(MonthlyLogReport, monthly) + y_count = _upsert_reports(YearlyLogReport, yearly) + + logging.info( + "Reports populated: %s weekly, %s monthly, %s yearly.", + w_count, m_count, y_count, + ) + return f"Weekly: {w_count}, Monthly: {m_count}, Yearly: {y_count}" + + +def _upsert_reports(model_class, data): + count = 0 + unique_fields = list(model_class._meta.unique_together[0]) + period_fields = unique_fields[1:] + for key, fields in data.items(): + coll_id = key[0] + period_values 
= key[1:] + lookup = {"collection_id": coll_id} + for idx, field_name in enumerate(period_fields): + lookup[field_name] = period_values[idx] + model_class.objects.update_or_create(defaults=fields, **lookup) + count += 1 + return count + + +@celery_app.task( + bind=True, + name=_("[Reports] Generate Log Report Summary (Manual)"), + queue="load", +) +def task_log_files_count_status_report( + self, + collections=None, + from_date=None, + until_date=None, + days_to_go_back=None, + user_id=None, + username=None, +): + from_date_str, until_date_str = date_utils.get_date_range_str( + from_date, until_date, days_to_go_back + ) + subject = _( + "Usage Log Report Summary " + f"({from_date_str} to {until_date_str})" + ) + + for collection_acron in (collections or Collection.acron3_list()): + try: + collection = Collection.objects.get(acron3=collection_acron) + except Collection.DoesNotExist: + logging.warning("Collection not found: %s", collection_acron) + continue + + message = _build_report_message( + collection, + from_date_str, + until_date_str, + ) + + if not message: + continue + + logging.info( + "Sending email to collection %s. Subject: %s.", + collection.main_name, subject, + ) + + _send_collection_email(subject, message, collection_acron) + + +def _build_report_message(collection, from_date_str, until_date_str): + monthly = MonthlyLogReport.objects.filter( + collection=collection, + ).order_by("-year", "-month") + + if not monthly.exists(): + return "" + + latest = monthly.first() + message = _( + f"Usage Log Report for {collection.acron3}\n" + f"Period: {from_date_str} to {until_date_str}\n\n" + ) + message += _("Latest month ({latest}):\n").format(latest=latest.period_label) + message += ( + f" Total files: {latest.total_files}\n" + f" Validated files: {latest.validated_files} ({latest.pct_validated}%)\n" + f" Invalidated files: {latest.invalidated_files}\n" + f" Errored files: {latest.errored_files}\n" + f" Lines parsed: {latest.lines_parsed}\n" + f" Valid lines: {latest.valid_lines} ({latest.pct_valid_lines}%)\n" + f" Discarded lines: {latest.discarded_lines}\n" + f" Remote IPs: {latest.ip_remote_count} ({latest.pct_remote_ip}%)\n" + f" Local IPs: {latest.ip_local_count}\n" + ) + + prev_month = latest + if len(monthly) > 1: + prev_month = monthly[1] + message += _("\nPrevious month ({prev}):\n").format(prev=prev_month.period_label) + message += ( + f" Total files: {prev_month.total_files}\n" + f" Validated files: {prev_month.validated_files} ({prev_month.pct_validated}%)\n" + f" Valid lines: {prev_month.valid_lines} ({prev_month.pct_valid_lines}%)\n" + f" Remote IPs: {prev_month.ip_remote_count} ({prev_month.pct_remote_ip}%)\n" + ) + + if prev_month.total_files: + file_diff = latest.total_files - prev_month.total_files + line_diff = latest.lines_parsed - prev_month.lines_parsed + message += _("\nMonth-over-month change:\n") + message += f" Files: {file_diff:+d}\n" + message += f" Lines: {line_diff:+d}\n" + + message += ( + f"\n---\n" + f"This report is automatically generated by SciELO Usage.\n" + ) + return message + + +def _send_collection_email(subject, message, collection): + emails = lmc_models.CollectionEmail.objects.filter( + config__collection__acron3=collection, active=True + ).values_list("email", flat=True) + + if not emails: + logging.error( + "Error. 
Please, add an E-mail Configuration for the collection %s.", + collection, + ) + return + + try: + send_mail( + subject=subject, + message=message, + from_email=settings.DEFAULT_FROM_EMAIL, + recipient_list=list(emails), + ) + except Exception as e: + logging.error("Error sending log files report for %s: %s", collection, e) diff --git a/reports/wagtail_hooks.py b/reports/wagtail_hooks.py new file mode 100644 index 0000000..b2aeac7 --- /dev/null +++ b/reports/wagtail_hooks.py @@ -0,0 +1,75 @@ +from django.contrib.auth import get_user_model +from django.utils.translation import gettext_lazy as _ +from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup +from wagtail.snippets.models import register_snippet +from wagtail.permission_policies.base import BasePermissionPolicy + +from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport + + +class ReadOnlyPermissionPolicy(BasePermissionPolicy): + def user_has_permission(self, user, action): + if action in ("add", "change", "delete"): + return False + return True + + def users_with_any_permission(self, actions): + return get_user_model().objects.filter(is_active=True) + + +COMMON_LIST_DISPLAY = ( + "total_files", + "pct_validated", + "lines_parsed", + "pct_valid_lines", + "pct_remote_ip", + "generated_at", +) + + +class WeeklyLogReportSnippetViewSet(SnippetViewSet): + model = WeeklyLogReport + menu_label = _("Weekly") + icon = "info-circle" + menu_order = 100 + list_display = ("collection", "year", "week") + COMMON_LIST_DISPLAY + list_filter = ("collection", "year", "week") + search_fields = ("collection__acron3",) + permission_policy = ReadOnlyPermissionPolicy(WeeklyLogReport) + + +class MonthlyLogReportSnippetViewSet(SnippetViewSet): + model = MonthlyLogReport + menu_label = _("Monthly") + icon = "info-circle" + menu_order = 200 + list_display = ("collection", "year", "month") + COMMON_LIST_DISPLAY + list_filter = ("collection", "year", "month") + search_fields = ("collection__acron3",) + permission_policy = ReadOnlyPermissionPolicy(MonthlyLogReport) + + +class YearlyLogReportSnippetViewSet(SnippetViewSet): + model = YearlyLogReport + menu_label = _("Yearly") + icon = "info-circle" + menu_order = 300 + list_display = ("collection", "year") + COMMON_LIST_DISPLAY + list_filter = ("collection", "year") + search_fields = ("collection__acron3",) + permission_policy = ReadOnlyPermissionPolicy(YearlyLogReport) + + +class ReportsSnippetViewSetGroup(SnippetViewSetGroup): + menu_name = "usage_reports" + menu_label = _("Reports") + menu_icon = "info-circle" + menu_order = 350 + items = ( + WeeklyLogReportSnippetViewSet, + MonthlyLogReportSnippetViewSet, + YearlyLogReportSnippetViewSet, + ) + + +register_snippet(ReportsSnippetViewSetGroup) diff --git a/requirements/base.txt b/requirements/base.txt index 6ef5fba..7b5ed61 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -63,10 +63,13 @@ minio==7.2.7 reverse-geocode==1.6 # https://pypi.org/project/reverse-geocode/ # SciELO Log Validator --e git+https://github.com/scieloorg/scielo_log_validator@0.4.0#egg=scielo_log_validator +-e git+https://github.com/scieloorg/scielo_log_validator@2.0.0#egg=scielo_log_validator + +# SciELO Scholarly Data +-e git+https://github.com/scieloorg/scielo_scholarly_data@v0.1.4#egg=scielo_scholarly_data # SciELO Usage COUNTER --e git+https://github.com/scieloorg/scielo_usage_counter@1.5.1#egg=scielo_usage_counter +-e git+https://github.com/scieloorg/scielo_usage_counter@2.0.0#egg=scielo_usage_counter # Device Detector 
device-detector==0.10 # https://github.com/thinkwelltwd/device_detector @@ -93,6 +96,6 @@ tenacity==8.3.0 # https://pypi.org/project/tenacity/ # ------------------------------------------------------------------------------ articlemetaapi==1.26.7 -# ElasticSearch +# OpenSearch # ------------------------------------------------------------------------------ -elasticsearch==8.18.1 # https://elasticsearch-py.readthedocs.io/en/v8.18.1/ +opensearch-py==3.1.0 diff --git a/resources/constants.py b/resources/constants.py index feba18d..2ce64da 100644 --- a/resources/constants.py +++ b/resources/constants.py @@ -1,2 +1,2 @@ DEFAULT_COUNTER_ROBOTS_URL = 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json' -DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2025-02.mmdb.gz' +DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2026-03.mmdb.gz' diff --git a/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py new file mode 100644 index 0000000..80bb0cc --- /dev/null +++ b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py @@ -0,0 +1,61 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("resources", "0001_initial"), + ] + + operations = [ + migrations.RemoveField( + model_name="mmdb", + name="creator", + ), + migrations.RemoveField( + model_name="mmdb", + name="updated_by", + ), + migrations.RemoveField( + model_name="robotuseragent", + name="creator", + ), + migrations.RemoveField( + model_name="robotuseragent", + name="updated_by", + ), + migrations.AddField( + model_name="robotuseragent", + name="is_active", + field=models.BooleanField( + db_index=True, default=True, verbose_name="Active" + ), + ), + migrations.AddField( + model_name="robotuseragent", + name="source_counter", + field=models.BooleanField( + db_index=True, default=False, verbose_name="From Atmire/COUNTER" + ), + ), + migrations.AddField( + model_name="robotuseragent", + name="source_scielo", + field=models.BooleanField( + db_index=True, default=False, verbose_name="From SciELO" + ), + ), + migrations.AddField( + model_name="robotuseragent", + name="source_url", + field=models.URLField( + blank=True, max_length=255, null=True, verbose_name="Source URL" + ), + ), + migrations.AlterField( + model_name="robotuseragent", + name="last_changed", + field=models.DateField(blank=True, null=True, verbose_name="Last Changed"), + ), + ] diff --git a/resources/models.py b/resources/models.py index a30b8d3..22663e2 100644 --- a/resources/models.py +++ b/resources/models.py @@ -2,11 +2,26 @@ from django.db import models from django.utils.translation import gettext_lazy as _ +from wagtail.admin.panels import FieldPanel -from core.models import CommonControlField +class RobotUserAgent(models.Model): + SOURCE_ALL = "all" + SOURCE_COUNTER = "counter" + SOURCE_SCIELO = "scielo" + SOURCE_CHOICES = [SOURCE_ALL, SOURCE_COUNTER, SOURCE_SCIELO] + panels = [ + FieldPanel("pattern"), + FieldPanel("source_counter"), + FieldPanel("source_scielo"), + FieldPanel("is_active"), + FieldPanel("source_url"), + FieldPanel("last_changed"), + ] + + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) + updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) -class RobotUserAgent(CommonControlField): 
pattern = models.CharField( verbose_name=_('Pattern'), max_length=255, @@ -14,21 +29,77 @@ class RobotUserAgent(CommonControlField): blank=False, primary_key=True, ) + source_counter = models.BooleanField( + verbose_name=_("From Atmire/COUNTER"), + default=False, + db_index=True, + ) + source_scielo = models.BooleanField( + verbose_name=_("From SciELO"), + default=False, + db_index=True, + ) + is_active = models.BooleanField( + verbose_name=_("Active"), + default=True, + db_index=True, + ) + source_url = models.URLField( + verbose_name=_("Source URL"), + max_length=255, + null=True, + blank=True, + ) last_changed = models.DateField( verbose_name=_('Last Changed'), - null=False, - blank=False, + null=True, + blank=True, ) @classmethod def get_all_patterns(cls): - return cls.objects.values_list('pattern', flat=True) + return cls.get_patterns(source=cls.SOURCE_ALL) + + @classmethod + def normalize_source(cls, source=None): + normalized = (source or cls.SOURCE_ALL).lower() + if normalized not in cls.SOURCE_CHOICES: + raise ValueError(f"Unsupported robots source: {source}") + return normalized + + @classmethod + def get_patterns(cls, source=None): + source = cls.normalize_source(source) + queryset = cls.objects.filter(is_active=True) + + if source == cls.SOURCE_COUNTER: + queryset = queryset.filter(source_counter=True) + elif source == cls.SOURCE_SCIELO: + queryset = queryset.filter(source_scielo=True) + + return queryset.values_list("pattern", flat=True) + + @property + def source_labels(self): + labels = [] + if self.source_counter: + labels.append("Atmire/COUNTER") + if self.source_scielo: + labels.append("SciELO") + return ", ".join(labels) or "-" + + def save(self, *args, **kwargs): + if not self.source_counter and not self.source_scielo: + self.source_scielo = True + super().save(*args, **kwargs) def __str__(self): return self.pattern -class MMDB(CommonControlField): +class MMDB(models.Model): + created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True) + updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True) id = models.CharField( verbose_name=_('ID (HASH)'), max_length=64, diff --git a/resources/tasks.py b/resources/tasks.py index e67cea1..4df60a9 100644 --- a/resources/tasks.py +++ b/resources/tasks.py @@ -1,19 +1,13 @@ import logging -from django.contrib.auth import get_user_model -from django.utils import timezone from django.utils.translation import gettext as _ from config import celery_app -from core.utils.utils import _get_user from . import constants, models, utils - -User = get_user_model() - -@celery_app.task(bind=True, name=_('Load robots data')) -def task_load_robots(self, url_robots=None, user_id=None, username=None): +@celery_app.task(bind=True, name=_('[Resources] Load Robots Data')) +def task_load_robots(self, url_robots=None): """ Load robots from a given URL and save them to the database. This function fetches robot data from a specified URL (or a default URL if none is provided), @@ -32,8 +26,6 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None): - Error if there is an issue downloading or saving the robots. - Debug information for each robot saved. """ - user = _get_user(self.request, username=username, user_id=user_id) - if not url_robots: url_robots = constants.DEFAULT_COUNTER_ROBOTS_URL logging.warning(f'No robots URL provided. 
Using default: {url_robots}') @@ -45,43 +37,63 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None): return False cleaned_robots_data = utils.clean_robots_list(robots_data) + fetched_patterns = set() try: for r_str in cleaned_robots_data: pattern = r_str.get('pattern') last_changed = r_str.get('last_changed') + fetched_patterns.add(pattern) - r_obj, created = models.RobotUserAgent.objects.get_or_create(pattern=pattern, last_changed=last_changed) + r_obj = models.RobotUserAgent.objects.filter(pattern=pattern).first() + created = r_obj is None if created: - r_obj.creator = user - - r_obj.updated = timezone.now() - r_obj.updated_by = user + r_obj = models.RobotUserAgent( + pattern=pattern, + source_counter=True, + source_scielo=False, + ) + r_obj.source_counter = True + r_obj.is_active = True + r_obj.source_url = url_robots + r_obj.last_changed = last_changed r_obj.save() logging.debug(f'Robot saved: {r_obj}') + + stale_counter_patterns = models.RobotUserAgent.objects.filter( + source_counter=True + ).exclude(pattern__in=fetched_patterns) + + for r_obj in stale_counter_patterns: + r_obj.source_counter = False + r_obj.source_url = None + r_obj.last_changed = None + if not r_obj.source_scielo: + r_obj.is_active = False + r_obj.save() + logging.debug(f'Robot deactivated or detached from COUNTER source: {r_obj}') + return True except Exception as e: logging.error(f'Error saving robots: {e}') + return False -@celery_app.task(bind=True, name=_('Load geolocation and country data')) -def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate=True): +@celery_app.task(bind=True, name=_('[Resources] Load Geolocation Data')) +def task_load_geoip(self, url_geoip=None, validate=True): """ Load GeoIP data from a specified URL, validate it, and save it to the database. Args: url_geoip (str, optional): The URL to download the GeoIP data from. Defaults to None. - user_id (int, optional): The ID of the user performing the task. Defaults to None. - username (str, optional): The username of the user performing the task. Defaults to None. validate (bool, optional): Whether to validate the GeoIP data. Defaults to True. Returns: bool: True if the GeoIP data was successfully loaded and saved, False otherwise. Raises: Exception: If there is an error downloading, decompressing, or validating the GeoIP data. """ - user = _get_user(self.request, username=username, user_id=user_id) if not url_geoip: url_geoip = constants.DEFAULT_MMDB_URL @@ -115,10 +127,6 @@ def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate= except models.MMDB.DoesNotExist: mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data) mmdb_obj.url = url_geoip or constants.DEFAULT_MMDB_URL - mmdb_obj.creator = user - - mmdb_obj.updated = timezone.now() - mmdb_obj.updated_by = user mmdb_obj.save() logging.debug(f'GeoIP data has been saved: {mmdb_obj}') diff --git a/resources/tests.py b/resources/tests.py deleted file mode 100644 index 7ce503c..0000000 --- a/resources/tests.py +++ /dev/null @@ -1,3 +0,0 @@ -from django.test import TestCase - -# Create your tests here. 
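
A minimal usage sketch of the reworked `RobotUserAgent` source filtering introduced above — illustrative only, not part of the changeset, and assuming nothing beyond the classmethods and `save()` behaviour visible in `resources/models.py`:

```python
# Illustrative sketch (not part of the diff): how callers are expected to
# consume the new source-aware robots API.
from resources.models import RobotUserAgent

# All active patterns, regardless of origin (replaces the old unconditional
# values_list over every row); source defaults to "all".
all_patterns = RobotUserAgent.get_patterns()

# Only patterns fetched from the Atmire/COUNTER list, or only the
# SciELO-maintained ones.
counter_patterns = RobotUserAgent.get_patterns(source="counter")
scielo_patterns = RobotUserAgent.get_patterns(source="scielo")

# Unknown sources are rejected up front by normalize_source().
try:
    RobotUserAgent.get_patterns(source="typo")
except ValueError as exc:
    print(exc)  # Unsupported robots source: typo

# A pattern created by hand (e.g. via the Wagtail snippet) with neither flag
# set is marked as SciELO-maintained by save(), and stays active.
bot = RobotUserAgent.objects.create(pattern="MyCrawler/1.0")
assert bot.source_scielo and bot.is_active
```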
diff --git a/resources/tests/__init__.py b/resources/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/resources/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/resources/tests/test_robots.py b/resources/tests/test_robots.py new file mode 100644 index 0000000..4d6bf74 --- /dev/null +++ b/resources/tests/test_robots.py @@ -0,0 +1,113 @@ +from unittest.mock import patch + +from django.test import TestCase + +from resources import models, tasks + + +class RobotUserAgentModelTests(TestCase): + def test_manual_robot_defaults_to_scielo_source(self): + robot = models.RobotUserAgent.objects.create(pattern="CustomBot") + + robot.refresh_from_db() + + self.assertFalse(robot.source_counter) + self.assertTrue(robot.source_scielo) + self.assertTrue(robot.is_active) + self.assertEqual(robot.source_labels, "SciELO") + + def test_get_all_patterns_only_returns_active_patterns(self): + active = models.RobotUserAgent.objects.create( + pattern="ActiveBot", + source_scielo=True, + is_active=True, + ) + models.RobotUserAgent.objects.create( + pattern="InactiveBot", + source_scielo=True, + is_active=False, + ) + + self.assertListEqual(list(models.RobotUserAgent.get_all_patterns()), [active.pattern]) + + def test_get_patterns_can_filter_by_source(self): + counter_only = models.RobotUserAgent.objects.create( + pattern="CounterOnlyBot", + source_counter=True, + source_scielo=False, + is_active=True, + ) + shared = models.RobotUserAgent.objects.create( + pattern="SharedBot", + source_counter=True, + source_scielo=True, + is_active=True, + ) + scielo_only = models.RobotUserAgent.objects.create( + pattern="ScieloOnlyBot", + source_counter=False, + source_scielo=True, + is_active=True, + ) + + self.assertCountEqual( + list(models.RobotUserAgent.get_patterns(source="counter")), + [counter_only.pattern, shared.pattern], + ) + self.assertCountEqual( + list(models.RobotUserAgent.get_patterns(source="scielo")), + [shared.pattern, scielo_only.pattern], + ) + + def test_get_patterns_rejects_invalid_source(self): + with self.assertRaises(ValueError): + list(models.RobotUserAgent.get_patterns(source="invalid")) + + +class LoadRobotsTaskTests(TestCase): + + @patch("resources.tasks.utils.fetch_data") + def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_entries( + self, + mock_fetch_data, + ): + mock_fetch_data.return_value = [ + {"pattern": "CounterBot", "last_changed": "2025-01-15"}, + {"pattern": "SharedBot", "last_changed": "2025-01-20"}, + ] + + stale_counter = models.RobotUserAgent.objects.create( + pattern="OldCounterBot", + source_counter=True, + is_active=True, + last_changed="2024-12-01", + source_url="https://old.example.org/robots.json", + ) + shared_bot = models.RobotUserAgent.objects.create( + pattern="SharedBot", + source_scielo=True, + is_active=True, + ) + + result = tasks.task_load_robots.run( + url_robots="https://counter.example.org/robots.json", + ) + + self.assertTrue(result) + + counter_bot = models.RobotUserAgent.objects.get(pattern="CounterBot") + self.assertTrue(counter_bot.source_counter) + self.assertFalse(counter_bot.source_scielo) + self.assertTrue(counter_bot.is_active) + self.assertEqual(counter_bot.source_url, "https://counter.example.org/robots.json") + + shared_bot.refresh_from_db() + self.assertTrue(shared_bot.source_counter) + self.assertTrue(shared_bot.source_scielo) + self.assertTrue(shared_bot.is_active) + + stale_counter.refresh_from_db() + self.assertFalse(stale_counter.source_counter) + self.assertFalse(stale_counter.is_active) + 
self.assertIsNone(stale_counter.source_url) + self.assertIsNone(stale_counter.last_changed) diff --git a/resources/wagtail_hooks.py b/resources/wagtail_hooks.py index 758bb53..c347b22 100644 --- a/resources/wagtail_hooks.py +++ b/resources/wagtail_hooks.py @@ -15,13 +15,25 @@ class RobotUserAgentSnippetViewSet(SnippetViewSet): list_display = ( "pattern", + "source_labels", + "is_active", "last_changed", ) search_fields = ( "pattern", + "source_url", + ) + list_filter = ( + "source_counter", + "source_scielo", + "is_active", ) list_export = ( "pattern", + "source_counter", + "source_scielo", + "is_active", + "source_url", "last_changed", ) export_filename = "robots" diff --git a/source/__init__.py b/source/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/source/__init__.py @@ -0,0 +1 @@ + diff --git a/article/apps.py b/source/apps.py similarity index 63% rename from article/apps.py rename to source/apps.py index 8c0e2c9..06d886d 100644 --- a/article/apps.py +++ b/source/apps.py @@ -1,6 +1,6 @@ from django.apps import AppConfig -class ArticleConfig(AppConfig): +class SourceConfig(AppConfig): default_auto_field = "django.db.models.BigAutoField" - name = "article" + name = "source" diff --git a/source/migrations/0001_initial.py b/source/migrations/0001_initial.py new file mode 100644 index 0000000..cc736e3 --- /dev/null +++ b/source/migrations/0001_initial.py @@ -0,0 +1,210 @@ +# Generated by Django 5.0.7 on 2026-03-15 00:00 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + ("collection", "0001_initial"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name="Source", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + verbose_name="Creation date", + ), + ), + ( + "updated", + models.DateTimeField( + auto_now=True, + verbose_name="Last update date", + ), + ), + ( + "source_type", + models.CharField( + choices=[ + ("journal", "Journal"), + ("book", "Book"), + ("preprint_server", "Preprint Server"), + ("data_repository", "Data Repository"), + ("other", "Other"), + ], + db_index=True, + max_length=32, + verbose_name="Source Type", + ), + ), + ( + "source_id", + models.CharField( + db_index=True, + max_length=255, + verbose_name="Source ID", + ), + ), + ( + "scielo_issn", + models.CharField( + blank=True, + db_index=True, + max_length=9, + null=True, + verbose_name="SciELO ISSN", + ), + ), + ( + "acronym", + models.CharField( + blank=True, + default="", + max_length=64, + null=True, + verbose_name="Source Acronym", + ), + ), + ( + "title", + models.CharField( + max_length=255, + verbose_name="Source Title", + ), + ), + ( + "identifiers", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Identifiers", + ), + ), + ( + "publisher_name", + models.JSONField( + blank=True, + default=list, + null=True, + verbose_name="Publisher Name", + ), + ), + ( + "subject_areas", + models.JSONField( + default=list, + verbose_name="Subject Areas (CAPES)", + ), + ), + ( + "wos_subject_areas", + models.JSONField( + default=list, + verbose_name="Subject Areas (WoS)", + ), + ), + ( + "default_lang", + models.CharField( + blank=True, + max_length=8, + null=True, + verbose_name="Default Language", + ), + ), + ( + 
"publication_date", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Publication Date", + ), + ), + ( + "publication_year", + models.CharField( + blank=True, + db_index=True, + max_length=4, + null=True, + verbose_name="Publication Year", + ), + ), + ( + "extra_data", + models.JSONField( + blank=True, + default=dict, + null=True, + verbose_name="Extra Data", + ), + ), + ( + "collection", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="collection.collection", + verbose_name="Collection", + ), + ), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + options={ + "verbose_name": "Source", + "verbose_name_plural": "Sources", + "unique_together": {("collection", "source_type", "source_id")}, + "indexes": [ + models.Index( + fields=["collection", "source_type"], + name="source_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + name="source_collection_issn_idx", + ), + ], + }, + ), + ] diff --git a/source/migrations/0002_source_access_type.py b/source/migrations/0002_source_access_type.py new file mode 100644 index 0000000..e148c15 --- /dev/null +++ b/source/migrations/0002_source_access_type.py @@ -0,0 +1,25 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("source", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="source", + name="access_type", + field=models.CharField( + blank=True, + choices=[ + ("open_access", "Open Access"), + ("commercial", "Commercial"), + ], + db_index=True, + max_length=32, + null=True, + verbose_name="Access Type", + ), + ), + ] diff --git a/source/migrations/0003_alter_source_title.py b/source/migrations/0003_alter_source_title.py new file mode 100644 index 0000000..354a82a --- /dev/null +++ b/source/migrations/0003_alter_source_title.py @@ -0,0 +1,15 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("source", "0002_source_access_type"), + ] + + operations = [ + migrations.AlterField( + model_name="source", + name="title", + field=models.CharField(max_length=500, verbose_name="Source Title"), + ), + ] diff --git a/source/migrations/__init__.py b/source/migrations/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/source/migrations/__init__.py @@ -0,0 +1 @@ + diff --git a/source/models.py b/source/models.py new file mode 100644 index 0000000..48d3e00 --- /dev/null +++ b/source/models.py @@ -0,0 +1,219 @@ +from django.db import models +from django.utils.translation import gettext_lazy as _ + +from collection.models import Collection +from core.models import CommonControlField + + +class Source(CommonControlField): + SOURCE_TYPE_JOURNAL = "journal" + SOURCE_TYPE_BOOK = "book" + SOURCE_TYPE_PREPRINT_SERVER = "preprint_server" + SOURCE_TYPE_DATA_REPOSITORY = "data_repository" + SOURCE_TYPE_OTHER = "other" + SOURCE_TYPE_CHOICES = ( + (SOURCE_TYPE_JOURNAL, _("Journal")), + (SOURCE_TYPE_BOOK, _("Book")), + (SOURCE_TYPE_PREPRINT_SERVER, _("Preprint Server")), + (SOURCE_TYPE_DATA_REPOSITORY, _("Data 
Repository")), + (SOURCE_TYPE_OTHER, _("Other")), + ) + + ACCESS_TYPE_OPEN_ACCESS = "open_access" + ACCESS_TYPE_COMMERCIAL = "commercial" + ACCESS_TYPE_CHOICES = ( + (ACCESS_TYPE_OPEN_ACCESS, _("Open Access")), + (ACCESS_TYPE_COMMERCIAL, _("Commercial")), + ) + + collection = models.ForeignKey( + Collection, + verbose_name=_("Collection"), + on_delete=models.CASCADE, + blank=False, + null=False, + db_index=True, + ) + + source_type = models.CharField( + verbose_name=_("Source Type"), + max_length=32, + choices=SOURCE_TYPE_CHOICES, + blank=False, + null=False, + db_index=True, + ) + + source_id = models.CharField( + verbose_name=_("Source ID"), + max_length=255, + blank=False, + null=False, + db_index=True, + ) + + scielo_issn = models.CharField( + verbose_name=_("SciELO ISSN"), + max_length=9, + blank=True, + null=True, + db_index=True, + ) + + acronym = models.CharField( + verbose_name=_("Source Acronym"), + max_length=64, + blank=True, + null=True, + default="", + ) + + title = models.CharField( + verbose_name=_("Source Title"), + max_length=500, + blank=False, + null=False, + ) + + identifiers = models.JSONField( + verbose_name=_("Identifiers"), + null=True, + blank=True, + default=dict, + ) + + publisher_name = models.JSONField( + verbose_name=_("Publisher Name"), + blank=True, + null=True, + default=list, + ) + + subject_areas = models.JSONField( + verbose_name=_("Subject Areas (CAPES)"), + null=False, + blank=False, + default=list, + ) + + wos_subject_areas = models.JSONField( + verbose_name=_("Subject Areas (WoS)"), + null=False, + blank=False, + default=list, + ) + + default_lang = models.CharField( + verbose_name=_("Default Language"), + max_length=8, + blank=True, + null=True, + ) + + publication_date = models.CharField( + verbose_name=_("Publication Date"), + max_length=32, + blank=True, + null=True, + ) + + publication_year = models.CharField( + verbose_name=_("Publication Year"), + max_length=4, + blank=True, + null=True, + db_index=True, + ) + + access_type = models.CharField( + verbose_name=_("Access Type"), + max_length=32, + choices=ACCESS_TYPE_CHOICES, + blank=True, + null=True, + db_index=True, + ) + + extra_data = models.JSONField( + verbose_name=_("Extra Data"), + null=True, + blank=True, + default=dict, + ) + + def __str__(self): + return f"{self.collection.acron3} - {self.source_type} - {self.source_id}" + + @staticmethod + def _extract_issns(identifiers): + if not isinstance(identifiers, dict): + return set() + + return { + value + for key, value in identifiers.items() + if value and "issn" in str(key).lower() + } + + @classmethod + def metadata(cls, collection=None): + queryset = cls.objects.select_related("collection").only( + "acronym", + "collection__acron3", + "default_lang", + "extra_data", + "identifiers", + "publication_date", + "publication_year", + "access_type", + "publisher_name", + "scielo_issn", + "source_id", + "source_type", + "subject_areas", + "title", + "wos_subject_areas", + ) + + if collection: + queryset = queryset.filter(collection=collection) + + for source in queryset.iterator(): + identifiers = source.identifiers or {} + yield { + "acronym": source.acronym, + "collection": source.collection.acron3, + "default_lang": source.default_lang, + "extra_data": source.extra_data or {}, + "identifiers": identifiers, + "issns": cls._extract_issns(identifiers), + "publication_date": source.publication_date, + "publication_year": source.publication_year, + "access_type": source.access_type, + "publisher_name": source.publisher_name or [], + 
"scielo_issn": source.scielo_issn, + "source_id": source.source_id, + "source_type": source.source_type, + "subject_areas": source.subject_areas or [], + "title": source.title, + "wos_subject_areas": source.wos_subject_areas or [], + } + + class Meta: + verbose_name = _("Source") + verbose_name_plural = _("Sources") + unique_together = ( + "collection", + "source_type", + "source_id", + ) + indexes = [ + models.Index( + fields=["collection", "source_type"], + name="source_collection_type_idx", + ), + models.Index( + fields=["collection", "scielo_issn"], + name="source_collection_issn_idx", + ), + ] diff --git a/source/services/__init__.py b/source/services/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/source/services/__init__.py @@ -0,0 +1 @@ + diff --git a/source/services/books.py b/source/services/books.py new file mode 100644 index 0000000..df9bd4d --- /dev/null +++ b/source/services/books.py @@ -0,0 +1,137 @@ +from collection.models import Collection +from source.models import Source + + +BOOKS_COLLECTION_ACRONYM = "books" + + +def get_books_collection(acronym=BOOKS_COLLECTION_ACRONYM): + return Collection.objects.get(acron3=acronym) + + +def upsert_monograph_source( + payload, + collection, + user=None, + force_update=True, + source_url=None, + last_seq=None, +): + if payload.get("TYPE") != "Monograph": + return None + + source, created = Source.objects.get_or_create( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id=str(payload.get("id")), + ) + + if created and user: + source.creator = user + + if created or force_update: + source.scielo_issn = None + source.acronym = "" + source.title = payload.get("title") or str(payload.get("id")) + source.identifiers = _build_source_identifiers(payload) + source.publisher_name = _as_list(payload.get("publisher")) + source.subject_areas = [] + source.wos_subject_areas = [] + source.default_lang = payload.get("language") or None + source.publication_date = payload.get("publication_date") or None + source.publication_year = _normalize_year(payload.get("year")) + source.access_type = _normalize_access_type(payload.get("is_comercial")) + source.extra_data = _build_source_extra_data( + payload, + source_url=source_url, + last_seq=last_seq, + ) + + if user: + source.updated_by = user + + source.save() + return source + + +def delete_book_source(collection, book_id): + return Source.objects.filter( + collection=collection, + source_type=Source.SOURCE_TYPE_BOOK, + source_id=str(book_id), + ).delete() + + +def _build_source_identifiers(payload): + identifiers = { + "book_id": str(payload.get("id")) if payload.get("id") is not None else None, + "isbn": payload.get("isbn"), + "eisbn": payload.get("eisbn"), + "doi": payload.get("doi_number"), + } + return _compact_dict(identifiers) + + +def _build_source_extra_data(payload, source_url=None, last_seq=None): + extra_data = { + "raw_type": payload.get("TYPE"), + "source_url": source_url, + "last_seq": last_seq, + "visible": payload.get("visible"), + "city": payload.get("city"), + "country": payload.get("country"), + "pages": payload.get("pages"), + "collection_data": payload.get("collection"), + "creators": payload.get("creators"), + "is_comercial": payload.get("is_comercial"), + "use_licence": payload.get("use_licence"), + "price_reais": payload.get("price_reais"), + "price_dollar": payload.get("price_dollar"), + "shopping_info": payload.get("shopping_info"), + "serie": payload.get("serie"), + "format": payload.get("format"), + "translated_titles": 
payload.get("translated_titles"), + "translated_synopses": payload.get("translated_synopses"), + "synopsis": payload.get("synopsis"), + "primary_descriptor": payload.get("primary_descriptor"), + "translated_primary_descriptors": payload.get("translated_primary_descriptors"), + } + return _compact_dict(extra_data) + + +def _as_list(value): + if not value: + return [] + + if isinstance(value, list): + return value + + return [value] + + +def _normalize_year(value): + if value in (None, ""): + return None + return str(value)[:4] + + +def _normalize_access_type(value): + if value in (None, ""): + return None + + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"true", "1", "yes", "y", "sim"}: + return Source.ACCESS_TYPE_COMMERCIAL + if normalized in {"false", "0", "no", "n", "nao", "não"}: + return Source.ACCESS_TYPE_OPEN_ACCESS + + return Source.ACCESS_TYPE_COMMERCIAL if bool(value) else Source.ACCESS_TYPE_OPEN_ACCESS + + +def _compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, ()) + } diff --git a/source/services/journals.py b/source/services/journals.py new file mode 100644 index 0000000..ac133f6 --- /dev/null +++ b/source/services/journals.py @@ -0,0 +1,118 @@ +from django.db.models import Q + +from collection.models import Collection +from source.models import Source + + +def get_collection(acronym): + return Collection.objects.filter(acron3=acronym).first() + + +def upsert_journal_source( + journal, + collection, + user=None, + force_update=True, + load_mode=None, +): + scielo_issn = _value(journal, "scielo_issn") + if not scielo_issn: + return None + + source, created = Source.objects.get_or_create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id=scielo_issn, + ) + + if created and user: + source.creator = user + + if created or force_update: + source.scielo_issn = scielo_issn + source.acronym = _value(journal, "acronym") or "" + source.title = _value(journal, "title") or scielo_issn + source.identifiers = _build_source_identifiers(journal) + source.publisher_name = _as_list(_value(journal, "publisher_name")) + source.subject_areas = _as_list(_value(journal, "subject_areas")) + source.wos_subject_areas = _as_list(_value(journal, "wos_subject_areas")) + source.default_lang = None + source.publication_date = None + source.publication_year = None + source.extra_data = _compact_dict( + { + "collection_acronym": _value(journal, "collection_acronym"), + "load_mode": load_mode, + } + ) + + if user: + source.updated_by = user + + source.save() + return source + + +def find_journal_source_by_issns(collection, issns): + for issn in filter(None, issns or []): + source = ( + Source.objects.filter( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + ) + .filter( + Q(scielo_issn=issn) + | Q(source_id=issn) + | Q(identifiers__electronic_issn=issn) + | Q(identifiers__print_issn=issn) + | Q(identifiers__scielo_issn=issn) + ) + .first() + ) + if source: + return source + return None + + +def find_journal_source_by_acronym(collection, acronym): + if not acronym: + return None + + return Source.objects.filter( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + acronym=acronym, + ).first() + + +def _build_source_identifiers(journal): + identifiers = { + "electronic_issn": _value(journal, "electronic_issn"), + "print_issn": _value(journal, "print_issn"), + "scielo_issn": _value(journal, "scielo_issn"), + } + return _compact_dict(identifiers) + + 
+def _as_list(value): + if not value: + return [] + + if isinstance(value, list): + return value + + return [value] + + +def _value(data, key, default=None): + if isinstance(data, dict): + return data.get(key, default) + return getattr(data, key, default) + + +def _compact_dict(data): + return { + key: value + for key, value in data.items() + if value not in (None, "", [], {}, ()) + } diff --git a/source/tasks.py b/source/tasks.py new file mode 100644 index 0000000..eb1633b --- /dev/null +++ b/source/tasks.py @@ -0,0 +1,148 @@ +import logging + +from django.utils.translation import gettext as _ +from django.conf import settings + +from collection.models import Collection +from config import celery_app +from core.collectors import articlemeta as articlemeta_collector +from core.collectors import scielo_books as scielo_books_collector +from core.utils.request_utils import _get_user +from source.services import books as books_service +from source.services import journals as journal_service + + +def load_sources_from_article_meta( + collections=None, + force_update=True, + user=None, + mode="thrift", +): + collection_codes = collections or Collection.acron3_list() + + for collection_code in collection_codes: + logging.info( + "Loading sources from Article Meta. Collection: %s, Mode: %s", + collection_code, + mode, + ) + + for journal in articlemeta_collector.iter_journals( + collection=collection_code, + mode=mode, + ): + collection = journal_service.get_collection(journal.collection_acronym) + if not collection: + logging.error( + "Collection %s does not exist", + journal.collection_acronym, + ) + continue + + source = journal_service.upsert_journal_source( + journal, + collection=collection, + user=user, + force_update=force_update, + load_mode=mode, + ) + logging.info( + "Source %s upserted for collection %s", + source.source_id if source else None, + collection.acron3, + ) + + return True + + +def load_sources_from_scielo_books( + collection="books", + db_name=settings.SCIELO_BOOKS_DB_NAME, + since=0, + limit=settings.SCIELO_BOOKS_LIMIT, + force_update=True, + headers=None, + base_url=None, + user=None, +): + collection_obj = books_service.get_books_collection(collection) + + logging.info( + "Loading sources from SciELO Books. 
Collection: %s, DB: %s, Since: %s, Limit: %s", + collection, + db_name, + since, + limit, + ) + + for item in scielo_books_collector.iter_change_documents( + base_url=base_url, + db_name=db_name, + since=since, + limit=limit, + headers=headers, + ): + change = item["change"] + + if item["deleted"]: + books_service.delete_book_source(collection_obj, change.get("id")) + continue + + payload = item["payload"] or {} + if payload.get("TYPE") != "Monograph": + continue + + books_service.upsert_monograph_source( + payload, + collection=collection_obj, + user=user, + force_update=force_update, + source_url=item.get("source_url"), + last_seq=change.get("seq"), + ) + + return True + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (Article Meta)"), queue="load") +def task_load_sources_from_article_meta( + self, + collections=None, + force_update=True, + user_id=None, + username=None, + mode="thrift", +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_sources_from_article_meta( + collections=collections, + force_update=force_update, + user=user, + mode=mode, + ) + + +@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (SciELO Books)"), queue="load") +def task_load_sources_from_scielo_books( + self, + collection="books", + db_name=settings.SCIELO_BOOKS_DB_NAME, + since=0, + limit=settings.SCIELO_BOOKS_LIMIT, + force_update=True, + headers=None, + base_url=None, + user_id=None, + username=None, +): + user = _get_user(self.request, username=username, user_id=user_id) + return load_sources_from_scielo_books( + collection=collection, + db_name=db_name, + since=since, + limit=limit, + force_update=force_update, + headers=headers, + base_url=base_url, + user=user, + ) diff --git a/source/tests.py b/source/tests.py new file mode 100644 index 0000000..a182f4e --- /dev/null +++ b/source/tests.py @@ -0,0 +1,133 @@ +from django.test import TestCase + +from collection.models import Collection + +from .models import Source +from .services import books as books_service +from .services import journals as journal_service + + +class SourceMetadataTests(TestCase): + def test_source_type_choices_include_scielo_non_journal_sources(self): + self.assertIn( + (Source.SOURCE_TYPE_PREPRINT_SERVER, "Preprint Server"), + [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES], + ) + self.assertIn( + (Source.SOURCE_TYPE_DATA_REPOSITORY, "Data Repository"), + [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES], + ) + + def test_metadata_exposes_generic_and_journal_fields(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + Source.objects.create( + collection=collection, + source_type=Source.SOURCE_TYPE_JOURNAL, + source_id="1234-5678", + scielo_issn="1234-5678", + acronym="testjou", + title="Test Journal", + identifiers={ + "electronic_issn": "1234-5678", + "print_issn": "8765-4321", + "doi": "10.1590/example", + }, + publisher_name=["SciELO"], + subject_areas=["Health Sciences"], + wos_subject_areas=["Medicine"], + default_lang="en", + publication_date="2024-01-15", + publication_year="2024", + extra_data={"country": "BR"}, + ) + + metadata = list(Source.metadata(collection=collection)) + + self.assertEqual(len(metadata), 1) + self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(metadata[0]["source_id"], "1234-5678") + self.assertEqual(metadata[0]["scielo_issn"], "1234-5678") + self.assertEqual(metadata[0]["issns"], {"1234-5678", "8765-4321"}) + 
self.assertEqual(metadata[0]["title"], "Test Journal") + + def test_upsert_monograph_source_maps_scielo_books_payload(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + + source = books_service.upsert_monograph_source( + { + "TYPE": "Monograph", + "id": "abcd1", + "title": "Sample Book", + "isbn": "9788578791889", + "eisbn": "9788578791880", + "doi_number": "10.1234/book", + "language": "pt", + "publication_date": "2024-05-20", + "year": "2024", + "publisher": "SciELO Books", + "is_comercial": False, + "visible": True, + }, + collection=collection, + ) + + self.assertEqual(source.source_type, Source.SOURCE_TYPE_BOOK) + self.assertEqual(source.source_id, "abcd1") + self.assertEqual(source.identifiers["isbn"], "9788578791889") + self.assertEqual(source.default_lang, "pt") + self.assertEqual(source.publication_year, "2024") + self.assertEqual(source.access_type, Source.ACCESS_TYPE_OPEN_ACCESS) + + def test_upsert_monograph_source_accepts_long_real_world_title(self): + collection = Collection.objects.create(acron3="books", acron2="bk") + title = ( + "O Estado da Arte sobre Refugiados, Deslocados Internos, " + "Deslocados Ambientais e Apatridas no Brasil: atualizacao do " + "Diretorio Nacional do ACNUR de teses, dissertacoes, trabalhos " + "de conclusao de curso de graduacao em Joao Pessoa (Paraiba) e " + "artigos (2007 a 2017)" + ) + + source = books_service.upsert_monograph_source( + { + "TYPE": "Monograph", + "id": "9zzts", + "title": title, + }, + collection=collection, + ) + + self.assertEqual(source.title, title) + + def test_upsert_journal_source_maps_articlemeta_payload(self): + collection = Collection.objects.create(acron3="scl", acron2="sc") + + source = journal_service.upsert_journal_source( + { + "collection_acronym": "scl", + "scielo_issn": "1234-5678", + "electronic_issn": "1234-5678", + "print_issn": "8765-4321", + "acronym": "testjou", + "title": "Test Journal", + "publisher_name": "SciELO", + "subject_areas": ["Health Sciences"], + "wos_subject_areas": ["Medicine"], + }, + collection=collection, + load_mode="thrift", + ) + + self.assertEqual(source.source_type, Source.SOURCE_TYPE_JOURNAL) + self.assertEqual(source.source_id, "1234-5678") + self.assertEqual(source.identifiers["electronic_issn"], "1234-5678") + self.assertEqual(source.publisher_name, ["SciELO"]) + self.assertEqual(source.extra_data["load_mode"], "thrift") + self.assertEqual( + journal_service.find_journal_source_by_issns(collection, ["8765-4321"]).pk, + source.pk, + ) + self.assertEqual( + journal_service.find_journal_source_by_acronym(collection, "testjou").pk, + source.pk, + ) diff --git a/source/wagtail_hooks.py b/source/wagtail_hooks.py new file mode 100644 index 0000000..5ffad62 --- /dev/null +++ b/source/wagtail_hooks.py @@ -0,0 +1,32 @@ +from django.utils.translation import gettext_lazy as _ +from wagtail.snippets.views.snippets import SnippetViewSet + +from .models import Source + + +class SourceSnippetViewSet(SnippetViewSet): + model = Source + icon = "folder-open-inverse" + menu_label = _("Source") + menu_order = 200 + + list_display = ( + "collection", + "source_type", + "source_id", + "scielo_issn", + "acronym", + "title", + "publication_year", + ) + list_filter = ( + "collection", + "source_type", + "publication_year", + ) + search_fields = ( + "source_id", + "scielo_issn", + "acronym", + "title", + ) diff --git a/start-dev.sh b/start-dev.sh deleted file mode 100644 index 92d064a..0000000 --- a/start-dev.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Change this value 
to the local ethernet. -ethernet=wlp0s20f3 - -# Linux IP. -export IP=$(/sbin/ip -o -4 addr list $ethernet | awk '{print $4}' | cut -d/ -f1) - -# Mac OS IP. -#export IP=$(ifconfig $ethernet | grep inet | grep -v inet6 | awk '{print $2}') - -export DATABASE_URL=postgres://GVRFlLmcCNfGLhsFvSnCioYOPJPYpyfj:BQ4hSUL4rdj5WZLdR8ilDLRQMvCtzo0caMaXDO0olGsmycQjlcZlTVK9DepZR8kk@$IP:5432/scielo_core -export CELERY_BROKER_URL=redis://$IP:6379/0 -export USE_DOCKER=no -export IPYTHONDIR=/app/.ipython -export REDIS_URL=redis://$IP:6379/0 -export CELERY_FLOWER_USER=PhFRdLexbrsBvrrbSXxjcMMOcVOavCrZ -export CELERY_FLOWER_PASSWORD=QgScyefPrYhHgO6onW61u0nazc5xdBuP4sM7jMRrBBFuA2RjsFhZLp7xbVYZbrwR -export EMAIL_HOST=$IP -export SOLR_URL=http://$IP:8983/solr/ - - -docker stop scielo_core_local_django -# workon scms -python manage.py runserver_plus 0.0.0.0:8000 diff --git a/tracker/choices.py b/tracker/choices.py index e2c80e2..dfc562c 100644 --- a/tracker/choices.py +++ b/tracker/choices.py @@ -1,54 +1,16 @@ from django.utils.translation import gettext_lazy as _ -ERROR = "ERROR" -EXCEPTION = "EXCEPTION" -INFO = "INFO" -WARNING = "WARNING" - -EVENT_MSG_TYPE = [ - (ERROR, _("error")), - (WARNING, _("warning")), - (INFO, _("info")), - (EXCEPTION, _("exception")), -] - - -PROGRESS_STATUS_IGNORED = "IGNORED" -PROGRESS_STATUS_REPROC = "REPROC" -PROGRESS_STATUS_TODO = "TODO" -PROGRESS_STATUS_DOING = "DOING" -PROGRESS_STATUS_DONE = "DONE" -PROGRESS_STATUS_PENDING = "PENDING" - -PROGRESS_STATUS = ( - (PROGRESS_STATUS_REPROC, _("To reprocess")), - (PROGRESS_STATUS_TODO, _("To do")), - (PROGRESS_STATUS_DONE, _("Done")), - (PROGRESS_STATUS_DOING, _("Doing")), - (PROGRESS_STATUS_PENDING, _("Pending")), - (PROGRESS_STATUS_IGNORED, _("ignored")), -) - LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA = 'MET' -LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE = 'ART' -LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL = 'JOU' +LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT = 'DOC' +LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE = 'SRC' LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION = 'URL' LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR = 'DBE' LOG_FILE_DISCARDED_LINE_REASON = [ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA, _("Missing Metadata")), - (LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE, _("Missing PIDv2 or PIDv3 or PID Generic")), - (LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL, _("Missing ISSN")), + (LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT, _("Missing Document")), + (LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE, _("Missing Source")), (LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION, _("URL Translation")), (LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR, _("Database Error")), ] - - -ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED = 'MUL' -ARTICLE_EVENT_TYPE_DATA_ERROR = 'ERR' - -ARTICLE_EVENT_TYPE = [ - (ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, _("Multiple Articles Returned")), - (ARTICLE_EVENT_TYPE_DATA_ERROR, _("Data Error")), -] diff --git a/tracker/exceptions.py b/tracker/exceptions.py index 31ed8c8..9ef3267 100644 --- a/tracker/exceptions.py +++ b/tracker/exceptions.py @@ -1,26 +1,2 @@ -class ProcEventCreateError(Exception): - ... - -class UnexpectedEventCreateError(Exception): - ... - -class EventCreateError(Exception): - ... - -class EventReportCreateError(Exception): - ... - -class EventReportSaveFileError(Exception): - ... - -class EventReportCreateError(Exception): - ... - -class EventReportDeleteEventsError(Exception): - ... - class LogFileDiscardedLineCreateError(Exception): ... 
- -class ArticleEventError(Exception): - ... diff --git a/tracker/migrations/0001_initial.py b/tracker/migrations/0001_initial.py index f207722..04fdc35 100644 --- a/tracker/migrations/0001_initial.py +++ b/tracker/migrations/0001_initial.py @@ -1,13 +1,18 @@ -# Generated by Django 5.0.7 on 2024-08-30 00:52 +# Generated by Codex on 2026-04-27 +import django.db.models.deletion import uuid +from django.conf import settings from django.db import migrations, models class Migration(migrations.Migration): initial = True - dependencies = [] + dependencies = [ + ("log_manager", "0001_initial"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] operations = [ migrations.CreateModel( @@ -24,21 +29,15 @@ class Migration(migrations.Migration): ), ( "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), ), ( "exception_type", - models.TextField( - blank=True, null=True, verbose_name="Exception Type" - ), + models.TextField(blank=True, null=True, verbose_name="Exception Type"), ), ( "exception_msg", - models.TextField( - blank=True, null=True, verbose_name="Exception Msg" - ), + models.TextField(blank=True, null=True, verbose_name="Exception Msg"), ), ("traceback", models.JSONField(blank=True, null=True)), ("detail", models.JSONField(blank=True, null=True)), @@ -46,9 +45,148 @@ class Migration(migrations.Migration): options={ "indexes": [ models.Index( - fields=["exception_type"], name="tracker_une_excepti_47ede4_idx" + fields=["exception_type"], + name="tracker_une_excepti_47ede4_idx", ) ], }, ), + migrations.CreateModel( + name="ArticleEvent", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), + ), + ( + "updated", + models.DateTimeField(auto_now=True, verbose_name="Last update date"), + ), + ( + "event_type", + models.CharField( + blank=True, + choices=[ + ("MUL", "Multiple Articles Returned"), + ("ERR", "Data Error"), + ], + max_length=3, + null=True, + verbose_name="Event Type", + ), + ), + ( + "message", + models.TextField(blank=True, null=True, verbose_name="Message"), + ), + ("data", models.JSONField(default=dict, verbose_name="Data")), + ("handled", models.BooleanField(default=False, verbose_name="Handled")), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + ), + migrations.CreateModel( + name="LogFileDiscardedLine", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField(auto_now_add=True, verbose_name="Creation date"), + ), + ( + "updated", + models.DateTimeField(auto_now=True, verbose_name="Last update date"), + ), + ( + "error_type", + models.CharField( + blank=True, + choices=[ + ("MET", "Missing Metadata"), + ("DOC", "Missing Document"), + ("SRC", "Missing Source"), + ("URL", "URL Translation"), + ("DBE", "Database Error"), + ], + max_length=3, + null=True, + 
verbose_name="Error Type", + ), + ), + ("data", models.JSONField(default=dict, verbose_name="Data")), + ( + "message", + models.TextField(blank=True, null=True, verbose_name="Message"), + ), + ("handled", models.BooleanField(default=False, verbose_name="Handled")), + ( + "creator", + models.ForeignKey( + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_creator", + to=settings.AUTH_USER_MODEL, + verbose_name="Creator", + ), + ), + ( + "log_file", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="log_manager.logfile", + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, + editable=False, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="%(class)s_last_mod_user", + to=settings.AUTH_USER_MODEL, + verbose_name="Updater", + ), + ), + ], + ), ] diff --git a/tracker/migrations/0002_remove_articleevent_creator_and_more.py b/tracker/migrations/0002_remove_articleevent_creator_and_more.py new file mode 100644 index 0000000..ee23c85 --- /dev/null +++ b/tracker/migrations/0002_remove_articleevent_creator_and_more.py @@ -0,0 +1,38 @@ +# Generated by Django 5.2.12 on 2026-05-01 22:23 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("tracker", "0001_initial"), + ] + + operations = [ + migrations.RemoveField( + model_name="articleevent", + name="creator", + ), + migrations.RemoveField( + model_name="articleevent", + name="updated_by", + ), + migrations.DeleteModel( + name="UnexpectedEvent", + ), + migrations.RemoveField( + model_name="logfilediscardedline", + name="creator", + ), + migrations.RemoveField( + model_name="logfilediscardedline", + name="updated", + ), + migrations.RemoveField( + model_name="logfilediscardedline", + name="updated_by", + ), + migrations.DeleteModel( + name="ArticleEvent", + ), + ] diff --git a/tracker/migrations/0002_top100articlesfileevent.py b/tracker/migrations/0002_top100articlesfileevent.py deleted file mode 100644 index 230fb8a..0000000 --- a/tracker/migrations/0002_top100articlesfileevent.py +++ /dev/null @@ -1,93 +0,0 @@ -# Generated by Django 5.0.7 on 2024-08-30 21:52 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("metrics", "0002_alter_top100articlesfile_status"), - ("tracker", "0001_initial"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Top100ArticlesFileEvent", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "status", - models.CharField( - blank=True, max_length=64, null=True, verbose_name="Status" - ), - ), - ( - "lines", - models.IntegerField( - blank=True, default=0, null=True, verbose_name="Lines" - ), - ), - ( - "message", - models.TextField(blank=True, null=True, verbose_name="Message"), - ), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "file", - models.ForeignKey( - blank=True, - null=True, - 
on_delete=django.db.models.deletion.SET_NULL, - to="metrics.top100articlesfile", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "verbose_name_plural": "Top 100 Article File Events", - }, - ), - ] diff --git a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py b/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py deleted file mode 100644 index 6e37a9f..0000000 --- a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py +++ /dev/null @@ -1,98 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-07 16:55 - -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("log_manager", "0002_alter_collectionconfig_unique_together_and_more"), - ("tracker", "0002_top100articlesfileevent"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="LogFileDiscardedLine", - fields=[ - ( - "id", - models.BigAutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "created", - models.DateTimeField( - auto_now_add=True, verbose_name="Creation date" - ), - ), - ( - "updated", - models.DateTimeField( - auto_now=True, verbose_name="Last update date" - ), - ), - ( - "error_type", - models.CharField( - blank=True, - choices=[ - ("MET", "Missing Metadata"), - ("ART", "Missing Article"), - ("JOU", "Missing Journal"), - ], - max_length=3, - null=True, - verbose_name="Error Type", - ), - ), - ("data", models.JSONField(default=dict, verbose_name="Data")), - ( - "message", - models.TextField(blank=True, null=True, verbose_name="Message"), - ), - ("handled", models.BooleanField(default=False, verbose_name="Handled")), - ( - "creator", - models.ForeignKey( - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_creator", - to=settings.AUTH_USER_MODEL, - verbose_name="Creator", - ), - ), - ( - "log_file", - models.ForeignKey( - on_delete=django.db.models.deletion.CASCADE, - to="log_manager.logfile", - ), - ), - ( - "updated_by", - models.ForeignKey( - blank=True, - editable=False, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="%(class)s_last_mod_user", - to=settings.AUTH_USER_MODEL, - verbose_name="Updater", - ), - ), - ], - options={ - "abstract": False, - }, - ), - migrations.DeleteModel( - name="Top100ArticlesFileEvent", - ), - ] diff --git a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py b/tracker/migrations/0004_alter_logfilediscardedline_error_type.py deleted file mode 100644 index 1061793..0000000 --- a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py +++ /dev/null @@ -1,28 +0,0 @@ -# Generated by Django 5.0.7 on 2025-03-27 20:40 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"), - ] - - operations = [ - migrations.AlterField( - model_name="logfilediscardedline", - name="error_type", - field=models.CharField( - blank=True, - choices=[ - ("MET", "Missing Metadata"), - ("ART", "Missing Article"), - ("JOU", "Missing Journal"), - ("URL", "URL Translation"), - 
-                ],
-                max_length=3,
-                null=True,
-                verbose_name="Error Type",
-            ),
-        ),
-    ]
diff --git a/tracker/migrations/0005_articleevent.py b/tracker/migrations/0005_articleevent.py
deleted file mode 100644
index 859910e..0000000
--- a/tracker/migrations/0005_articleevent.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Generated by Django 5.0.7 on 2025-05-23 17:27
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("tracker", "0004_alter_logfilediscardedline_error_type"),
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-    ]
-
-    operations = [
-        migrations.CreateModel(
-            name="ArticleEvent",
-            fields=[
-                (
-                    "id",
-                    models.BigAutoField(
-                        auto_created=True,
-                        primary_key=True,
-                        serialize=False,
-                        verbose_name="ID",
-                    ),
-                ),
-                (
-                    "created",
-                    models.DateTimeField(
-                        auto_now_add=True, verbose_name="Creation date"
-                    ),
-                ),
-                (
-                    "updated",
-                    models.DateTimeField(
-                        auto_now=True, verbose_name="Last update date"
-                    ),
-                ),
-                (
-                    "event_type",
-                    models.CharField(
-                        blank=True,
-                        choices=[
-                            ("MUL", "Multiple Articles Returned"),
-                            ("ERR", "Data Error"),
-                        ],
-                        max_length=3,
-                        null=True,
-                        verbose_name="Event Type",
-                    ),
-                ),
-                (
-                    "message",
-                    models.TextField(blank=True, null=True, verbose_name="Message"),
-                ),
-                ("data", models.JSONField(default=dict, verbose_name="Data")),
-                ("handled", models.BooleanField(default=False, verbose_name="Handled")),
-                (
-                    "creator",
-                    models.ForeignKey(
-                        editable=False,
-                        null=True,
-                        on_delete=django.db.models.deletion.SET_NULL,
-                        related_name="%(class)s_creator",
-                        to=settings.AUTH_USER_MODEL,
-                        verbose_name="Creator",
-                    ),
-                ),
-                (
-                    "updated_by",
-                    models.ForeignKey(
-                        blank=True,
-                        editable=False,
-                        null=True,
-                        on_delete=django.db.models.deletion.SET_NULL,
-                        related_name="%(class)s_last_mod_user",
-                        to=settings.AUTH_USER_MODEL,
-                        verbose_name="Updater",
-                    ),
-                ),
-            ],
-            options={
-                "abstract": False,
-            },
-        ),
-    ]
diff --git a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py b/tracker/migrations/0006_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index fb7f74a..0000000
--- a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-14 10:46
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("tracker", "0005_articleevent"),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name="logfilediscardedline",
-            name="error_type",
-            field=models.CharField(
-                blank=True,
-                choices=[
-                    ("MET", "Missing Metadata"),
-                    ("ART", "Missing Article"),
-                    ("JOU", "Missing Journal"),
-                    ("URL", "URL Translation"),
-                    ("DBE", "Database Error"),
-                ],
-                max_length=3,
-                null=True,
-                verbose_name="Error Type",
-            ),
-        ),
-    ]
diff --git a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py b/tracker/migrations/0007_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index f9ffebe..0000000
--- a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Generated by Django 5.0.7 on 2025-08-09 21:04
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-    dependencies = [
-        ("tracker", "0006_alter_logfilediscardedline_error_type"),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name="logfilediscardedline",
-            name="error_type",
-            field=models.CharField(
-                blank=True,
-                choices=[
-                    ("MET", "Missing Metadata"),
-                    ("ART", "Missing PIDv2 or PIDv3 or PID Generic"),
-                    ("JOU", "Missing ISSN"),
-                    ("URL", "URL Translation"),
-                    ("DBE", "Database Error"),
-                ],
-                max_length=3,
-                null=True,
-                verbose_name="Error Type",
-            ),
-        ),
-    ]
diff --git a/tracker/models.py b/tracker/models.py
index 77086ee..a394ed6 100644
--- a/tracker/models.py
+++ b/tracker/models.py
@@ -1,65 +1,13 @@
-import json
-import logging
-import traceback
-import uuid
-
-from datetime import datetime
-
-from django.core.files.base import ContentFile
 from django.db import models
 from django.utils.translation import gettext_lazy as _
 
-from core.models import CommonControlField
 from log_manager.models import LogFile
 from tracker import choices
-
-from .exceptions import *
+from .exceptions import LogFileDiscardedLineCreateError
 
 
-class ArticleEvent(CommonControlField):
-    event_type = models.CharField(
-        _("Event Type"),
-        choices=choices.ARTICLE_EVENT_TYPE,
-        max_length=3,
-        null=True,
-        blank=True,
-    )
-
-    message = models.TextField(
-        _("Message"),
-        null=True,
-        blank=True,
-    )
-
-    data = models.JSONField(
-        _("Data"),
-        default=dict,
-    )
-
-    handled = models.BooleanField(
-        _("Handled"),
-        default=False
-    )
-
-    @classmethod
-    def create(cls, event_type, message, data):
-        try:
-            obj = cls()
-            obj.event_type = event_type
-            obj.message = message
-            obj.data = data
-            obj.save()
-        except Exception as exc:
-            raise ArticleEventError(
-                f"Unable to create ArticleEvent ({data} - {event_type} - {message}). EXCEPTION {exc}"
-            )
-        return obj
-
-    def __str__(self):
-        return f"{self.event_type} - {self.message}"
-
-
-class LogFileDiscardedLine(CommonControlField):
+class LogFileDiscardedLine(models.Model):
+    created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
     log_file = models.ForeignKey(
         LogFile,
         on_delete=models.CASCADE,
@@ -108,174 +56,4 @@
     def __str__(self):
         return f"{self.data} - {self.message}"
 
 
-class UnexpectedEvent(models.Model):
-    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
-    created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
-    exception_type = models.TextField(_("Exception Type"), null=True, blank=True)
-    exception_msg = models.TextField(_("Exception Msg"), null=True, blank=True)
-    traceback = models.JSONField(null=True, blank=True)
-    detail = models.JSONField(null=True, blank=True)
-
-    class Meta:
-        indexes = [
-            models.Index(fields=["exception_type"]),
-        ]
-
-    def __str__(self):
-        return f"{self.exception_msg}"
-
-    @property
-    def data(self):
-        return dict(
-            created=self.created.isoformat(),
-            exception_type=self.exception_type,
-            exception_msg=self.exception_msg,
-            traceback=json.dumps(self.traceback),
-            detail=json.dumps(self.detail),
-        )
-
-    @classmethod
-    def create(
-        cls,
-        exception=None,
-        exc_traceback=None,
-        detail=None,
-    ):
-        try:
-            if exception:
-                logging.exception(exception)
-
-            obj = cls()
-            obj.exception_msg = str(exception)
-            obj.exception_type = str(type(exception))
-            try:
-                json.dumps(detail)
-                obj.detail = detail
-            except Exception as e:
-                obj.detail = str(detail)
-            if exc_traceback:
-                obj.traceback = traceback.format_tb(exc_traceback)
-            obj.save()
-            return obj
-        except Exception as exc:
-            raise UnexpectedEventCreateError(
-                f"Unable to create unexpected event ({exception} {exc_traceback}). EXCEPTION {exc}"
-            )
-
-
-class Event(CommonControlField):
-    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
-    message = models.TextField(_("Message"), null=True, blank=True)
-    message_type = models.CharField(
-        _("Message type"),
-        choices=choices.EVENT_MSG_TYPE,
-        max_length=16,
-        null=True,
-        blank=True,
-    )
-    detail = models.JSONField(null=True, blank=True)
-    unexpected_event = models.ForeignKey(
-        'UnexpectedEvent', on_delete=models.SET_NULL, null=True, blank=True
-    )
-
-    class Meta:
-        abstract = True
-        indexes = [
-            models.Index(fields=["message_type"]),
-        ]
-
-    @property
-    def data(self):
-        d = {}
-        d["created"] = self.created.isoformat()
-        d["user"] = self.user.username
-        d.update(
-            dict(
-                message=self.message, message_type=self.message_type, detail=self.detail
-            )
-        )
-        if self.unexpected_event:
-            d.update(self.unexpected_event.data)
-        return d
-
-    @classmethod
-    def create(
-        cls,
-        user=None,
-        message_type=None,
-        message=None,
-        e=None,
-        exc_traceback=None,
-        detail=None,
-    ):
-        try:
-            obj = cls()
-            obj.creator = user
-            obj.message = message
-            obj.message_type = message_type
-            obj.detail = detail
-            obj.save()
-
-            if e:
-                logging.exception(f"{message}: {e}")
-                obj.unexpected_event = UnexpectedEvent.create(
-                    exception=e,
-                    exc_traceback=exc_traceback,
-                )
-                obj.save()
-        except Exception as exc:
-            raise EventCreateError(
-                f"Unable to create Event ({message} {e}). EXCEPTION: {exc}"
-            )
-        return obj
-
-
-def tracker_file_directory_path(instance, filename):
-    d = datetime.now(datetime.timezone.utc)
-    return f"tracker/{d.year}/{d.month}/{d.day}/{filename}"
-
-
-class EventReport(CommonControlField):
-    file = models.FileField(
-        upload_to=tracker_file_directory_path, null=True, blank=True
-    )
-
-    class Meta:
-        abstract = True
-
-    def save_file(self, events, ext=None):
-        if not events:
-            return
-        try:
-            ext = ".json"
-            content = json.dumps(list([item.data for item in events]))
-            name = datetime.now(datetime.timezone.utc).isoformat() + ext
-            self.file.save(name, ContentFile(content))
-            self.delete_events(events)
-        except Exception as e:
-            raise EventReportSaveFileError(
-                f"Unable to save EventReport.file ({name}). Exception: {e}"
-            )
-
-    def delete_events(self, events):
-        for item in events:
-            try:
-                item.unexpected_event.delete()
-            except:
-                pass
-            try:
-                item.delete()
-            except:
-                pass
-
-    @classmethod
-    def create(cls, user):
-        try:
-            obj = cls()
-            obj.creator = user
-            obj.save()
-        except Exception as e:
-            raise EventReportCreateError(
-                f"Unable to create EventReport. Exception: {e}"
-            )
diff --git a/tracker/tasks.py b/tracker/tasks.py
deleted file mode 100644
index ace8145..0000000
--- a/tracker/tasks.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# tasks.py
-from datetime import datetime
-
-from django.contrib.auth import get_user_model
-
-from config import celery_app
-from core.utils.utils import _get_user
-
-from .models import UnexpectedEvent
-
-
-User = get_user_model()
-
-
-@celery_app.task(bind=True, name="Cleanup unexpected events")
-def delete_unexpected_events(self, exception_type, start_date=None, end_date=None, user_id=None, username=None):
-    """
-    Delete UnexpectedEvent records based on exception type and optional date range.
-    """
-    user = _get_user(self.request, username=username, user_id=user_id)
-
-    if exception_type == '__all__':
-        UnexpectedEvent.objects.all().delete()
-        return
-
-    filters = {'exception_type__icontains': exception_type}
-    if start_date:
-        start_date = datetime.fromisoformat(start_date)
-        filters['created__gte'] = start_date
-    if end_date:
-        end_date = datetime.fromisoformat(end_date)
-        filters['created__lte'] = end_date
-
-    UnexpectedEvent.objects.filter(**filters).delete()
diff --git a/tracker/wagtail_hooks.py b/tracker/wagtail_hooks.py
index ce1b30f..1ceb9c7 100644
--- a/tracker/wagtail_hooks.py
+++ b/tracker/wagtail_hooks.py
@@ -4,35 +4,9 @@
 
 from config.menu import get_menu_order
 
-from .models import UnexpectedEvent, LogFileDiscardedLine, ArticleEvent
+from .models import LogFileDiscardedLine
 
 
-class UnexpectedEventSnippetViewSet(SnippetViewSet):
-    model = UnexpectedEvent
-    menu_label = _("Unexpected Events")
-    icon = 'warning'
-    menu_order = get_menu_order("tracker")
-    add_to_admin_menu = False
-
-    list_display = (
-        "exception_type",
-        "exception_msg",
-        "traceback",
-        "created",
-    )
-    list_filter = ("exception_type",)
-    search_fields = (
-        "exception_msg",
-        "detail",
-    )
-    inspect_view_fields = (
-        "exception_type",
-        "exception_msg",
-        "traceback",
-        "detail",
-        "created",
-    )
-
 class LogFileDiscardedLineSnippetViewSet(SnippetViewSet):
     model = LogFileDiscardedLine
     menu_label = _("Discarded Lines")
@@ -64,34 +38,7 @@ class LogFileDiscardedLineSnippetViewSet(SnippetViewSet):
         "handled",
     )
 
 
-class ArticleEventSnippetViewSet(SnippetViewSet):
-    model = ArticleEvent
-    menu_label = _("Article Events")
-    icon = 'warning'
-    menu_order = get_menu_order("tracker")
-    add_to_admin_menu = False
-
-    list_display = (
-        "event_type",
-        "message",
-        "data",
-        "handled",
-    )
-
-    list_filter = (
-        "event_type",
-        "handled",
-    )
-    search_fields = (
-        "message",
-    )
-    inspect_view_fields = (
-        "event_type",
-        "message",
-        "data",
-        "handled",
-    )
 
 
 class TrackerSnippetViewSetGroup(SnippetViewSetGroup):
@@ -101,9 +48,7 @@
     menu_order = get_menu_order("tracker")
 
     items = (
-        UnexpectedEventSnippetViewSet,
         LogFileDiscardedLineSnippetViewSet,
-        ArticleEventSnippetViewSet,
    )