diff --git a/.dockerignore b/.dockerignore
index 5518e60..79d8c95 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,7 +4,11 @@
.gitignore
.gitlab-ci.yml
.idea
+.ipython
.pre-commit-config.yaml
.readthedocs.yml
.travis.yml
+.continue
+.envs/.local/.django
+temp
venv
diff --git a/.envs/.local/.django b/.envs/.local/.django
index 168e012..53f6444 100644
--- a/.envs/.local/.django
+++ b/.envs/.local/.django
@@ -15,3 +15,13 @@ CELERY_BROKER_URL=redis://redis:6379/0
# ------------------------------------------------------------------------------
CELERY_FLOWER_USER=PhFRdLexbrsBvrrbSXxjcMMOcVOavCrZ
CELERY_FLOWER_PASSWORD=QgScyefPrYhHgO6onW61u0nazc5xdBuP4sM7jMRrBBFuA2RjsFhZLp7xbVYZbrwR
+
+# OpenSearch
+# ------------------------------------------------------------------------------
+OPENSEARCH_URL=http://172.20.0.1:9200
+OPENSEARCH_BASIC_AUTH=admin,UmaSenhaForte123!
+OPENSEARCH_VERIFY_CERTS=False
+
+# SciELO Books API
+# ------------------------------------------------------------------------------
+SCIELO_BOOKS_BASE_URL=http://192.168.2.154:31735
diff --git a/.gitignore b/.gitignore
index 6342047..dd2c92d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,4 +169,11 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+
+# Local agent/editor state
+.continue/
+temp/
+.envs/.local/.django
+start-dev.sh
+opencode.json
diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 55509fe..0000000
--- a/.pylintrc
+++ /dev/null
@@ -1,14 +0,0 @@
-[MASTER]
-load-plugins=pylint_django, pylint_celery
-django-settings-module=config.settings.base
-[FORMAT]
-max-line-length=120
-
-[MESSAGES CONTROL]
-disable=missing-docstring,invalid-name
-
-[DESIGN]
-max-parents=13
-
-[TYPECHECK]
-generated-members=REQUEST,acl_users,aq_parent,"[a-zA-Z]+_set{1,2}",save,delete
diff --git a/.readthedocs.yml b/.readthedocs.yml
deleted file mode 100644
index b4cf0c0..0000000
--- a/.readthedocs.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 2
-
-sphinx:
- configuration: docs/conf.py
-
-build:
- image: testing
-
-python:
- version: 3.9
- install:
- - requirements: requirements/local.txt
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..1f28439
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,83 @@
+# AGENTS.md
+
+## Project
+
+Django 5.2 + Wagtail 7.3 + Celery app that ingests SciELO access logs, validates them, and exports COUNTER-5 metrics to OpenSearch with monthly indices and daily nested metrics.
+
+## Key commands
+
+All commands run inside Docker via the `local.yml` compose file unless noted.
+
+```bash
+make build # build images
+make up # start all services (django, postgres, redis, celery worker+beat, mailhog)
+make django_shell # Django shell via docker compose
+make django_test # run full test suite (pytest)
+make django_fast # tests with --failfast
+make django_migrate # apply migrations
+make django_makemigrations # generate new migrations
+make django_createsuperuser # create Wagtail admin user
+```
+
+**Run a single test file/path:**
+```bash
+docker compose -f local.yml run --rm django pytest path/to/test_file.py
+```
+
+**Without Docker** (rare): use `start-dev.sh` after adjusting the ethernet interface name.
+
+## Architecture
+
+- **Wagtail admin**: `http://localhost:8009/admin` (NOT Django admin at `/django-admin/`)
+- **Django apps** (top-level dirs): `core` (Wagtail pages, users, utilities, collectors), `collection`, `log_manager`, `log_manager_config`, `metrics`, `document`, `reports`, `resources`, `source`, `tracker`, `core_settings`
+- **`core/`** contains utilities, shared models, Wagtail hooks, templates, and the `collectors/` subpackage. `config/` is the Django project package (settings, urls, celery_app, wsgi).
+- **Celery pipeline**: `task_daily_log_ingestion_pipeline` (auto-scheduled) chains Search -> Validate -> Parse -> Export using Celery chords. Individual steps can be triggered manually via Wagtail admin.
+- **Task names** use translatable strings, e.g. `_[Log Pipeline] 1. Search Logs (Manual)` — do not rename these casually; doing so breaks the schedule.
+
+## Settings
+
+- `DJANGO_SETTINGS_MODULE` defaults to `config.settings.local`
+- Tests use `config.settings.test` (set via `pytest.ini` `--ds=config.settings.test`)
+- Env files live in `.envs/.local/` (local) and `.envs/.production/` (production)
+- **`config/settings/test.py`** is minimal — it extends `base.py` and does NOT load local.py. If a test needs a setting that only exists in local.py, it must be added to test.py or set in the test directly.
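+
+A minimal sketch of what that separation looks like (illustrative only; the override shown is an assumption, not copied from the repo):
+
+```python
+# config/settings/test.py -- sketch of the "extends base, not local" pattern
+from .base import *  # noqa: F401,F403
+
+# Example test-only override (hypothetical); the real overrides may differ.
+PASSWORD_HASHERS = ["django.contrib.auth.hashers.MD5PasswordHasher"]
+```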
+
+## Testing
+
+- Framework: **pytest** (not Django's `TestCase` runner), with `--reuse-db` by default
+- Config: `pytest.ini` sets `--ds=config.settings.test --reuse-db`
+- Both `unittest.TestCase` (Django-style) and pytest-style tests coexist; `pytest` is the runner
+- CI runs: `build -> makemigrations -> migrate -> pytest`
+- Shared fixtures in `core/conftest.py` (autouse `media_storage`, `user` fixture via factory-boy)
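+
+Those shared fixtures typically follow the standard cookiecutter-django shape; a hedged sketch (import paths are assumptions, check `core/conftest.py` for the real ones):
+
+```python
+import pytest
+
+from core.users.tests.factories import UserFactory  # hypothetical path
+
+
+@pytest.fixture(autouse=True)
+def media_storage(settings, tmpdir):
+    # Redirect uploads to a temp dir so tests never touch real media storage.
+    settings.MEDIA_ROOT = tmpdir.strpath
+
+
+@pytest.fixture
+def user(db):
+    # Database-backed user built with factory-boy.
+    return UserFactory()
+```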
+
+## Linting & formatting
+
+- **black** (runs at its default line length of 88, pinned by the pre-commit config; flake8 separately allows lines up to 120)
+- **isort** (black profile, `line_length=88`)
+- **flake8** (max-line-length=120 via setup.cfg)
+- Pre-commit runs all three on commit. Configuration in `setup.cfg` (flake8, isort, mypy) and `.pre-commit-config.yaml`.
+
+## Local dev quirks
+
+- Two SciELO libs (`scielo_log_validator`, `scielo_usage_counter`) are installed from local repos mounted at `/app/scielo_log_validator` and `/app/scielo_usage_counter` when `USE_LOCAL_SCIELO_LIBS=1`. The local Dockerfile strips these from `base.txt` during build and installs them from the mounted volumes via the entrypoint script.
+- Log files volume: `/mnt/pidata2/pi/scl/logs:/app/logs` (host-specific, may not exist on all machines)
+- Mailhog UI at `http://localhost:8029`
+- `manage.py` appends `core/` to `sys.path` so `from core.utils import ...` and `from utils import ...` both resolve.
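+
+A sketch of the `sys.path` tweak described in the last bullet (the exact code in `manage.py` may differ):
+
+```python
+# manage.py (excerpt-style sketch, not the literal file)
+import sys
+from pathlib import Path
+
+BASE_DIR = Path(__file__).resolve().parent
+sys.path.append(str(BASE_DIR / "core"))  # makes `from utils import ...` resolve too
+```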
+
+## OpenSearch
+
+- Client configured via `OPENSEARCH_URL`, `OPENSEARCH_BASIC_AUTH`, `OPENSEARCH_VERIFY_CERTS`
+- Index naming: `usage_monthly_{collection}_{year}` (e.g. `usage_monthly_books_2026`)
+- Upserts use Painless scripts for idempotent daily metric merging
+- `OPENSEARCH_INDEX_NAME` (default `usage`) and `OPENSEARCH_API_KEY` are defined in base settings but not widely used
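+
+A hedged sketch of how a client could be built from those settings with `opensearch-py` (helper names are hypothetical, and the `user,pass` separator is taken from the sample `.envs/.local/.django`; confirm against the real settings code):
+
+```python
+import os
+
+from opensearchpy import OpenSearch
+
+
+def get_client() -> OpenSearch:
+    user, password = os.environ["OPENSEARCH_BASIC_AUTH"].split(",", 1)
+    return OpenSearch(
+        hosts=[os.environ["OPENSEARCH_URL"]],
+        http_auth=(user, password),
+        verify_certs=os.environ.get("OPENSEARCH_VERIFY_CERTS", "False") == "True",
+    )
+
+
+def monthly_index(collection: str, year: int) -> str:
+    # e.g. monthly_index("books", 2026) -> "usage_monthly_books_2026"
+    return f"usage_monthly_{collection}_{year}"
+```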
+
+## MCP tools
+
+- When you need to search framework/library docs (Django, Wagtail, Celery, OpenSearch, etc.), use `context7` tools.
+- When you need to find code examples or patterns from open-source projects, use `gh_grep` tools.
+
+## Wagtail-specific notes
+
+- Multi-language: `pt-br` (default), `en`, `es`
+- Wagtail URL prefixes disabled (`prefix_default_language=False`)
+- After adding a language, run `make wagtail_sync` and `make wagtail_update_translation_field`
+- `wagtail-modeladmin` is used for managing pipeline entities in admin
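+
+Assumed shape of the language configuration described above (a sketch, not copied from the repo):
+
+```python
+# config/settings/base.py (sketch)
+LANGUAGE_CODE = "pt-br"
+WAGTAIL_CONTENT_LANGUAGES = LANGUAGES = [
+    ("pt-br", "Português"),
+    ("en", "English"),
+    ("es", "Español"),
+]
+# In urls.py the page routes sit inside i18n_patterns(..., prefix_default_language=False),
+# so the default language is served without a /pt-br/ URL prefix.
+```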
diff --git a/COPYING b/COPYING
deleted file mode 100644
index 94a9ed0..0000000
--- a/COPYING
+++ /dev/null
@@ -1,674 +0,0 @@
- GNU GENERAL PUBLIC LICENSE
- Version 3, 29 June 2007
-
- Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
- Preamble
-
- The GNU General Public License is a free, copyleft license for
-software and other kinds of works.
-
- The licenses for most software and other practical works are designed
-to take away your freedom to share and change the works. By contrast,
-the GNU General Public License is intended to guarantee your freedom to
-share and change all versions of a program--to make sure it remains free
-software for all its users. We, the Free Software Foundation, use the
-GNU General Public License for most of our software; it applies also to
-any other work released this way by its authors. You can apply it to
-your programs, too.
-
- When we speak of free software, we are referring to freedom, not
-price. Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-them if you wish), that you receive source code or can get it if you
-want it, that you can change the software or use pieces of it in new
-free programs, and that you know you can do these things.
-
- To protect your rights, we need to prevent others from denying you
-these rights or asking you to surrender the rights. Therefore, you have
-certain responsibilities if you distribute copies of the software, or if
-you modify it: responsibilities to respect the freedom of others.
-
- For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must pass on to the recipients the same
-freedoms that you received. You must make sure that they, too, receive
-or can get the source code. And you must show them these terms so they
-know their rights.
-
- Developers that use the GNU GPL protect your rights with two steps:
-(1) assert copyright on the software, and (2) offer you this License
-giving you legal permission to copy, distribute and/or modify it.
-
- For the developers' and authors' protection, the GPL clearly explains
-that there is no warranty for this free software. For both users' and
-authors' sake, the GPL requires that modified versions be marked as
-changed, so that their problems will not be attributed erroneously to
-authors of previous versions.
-
- Some devices are designed to deny users access to install or run
-modified versions of the software inside them, although the manufacturer
-can do so. This is fundamentally incompatible with the aim of
-protecting users' freedom to change the software. The systematic
-pattern of such abuse occurs in the area of products for individuals to
-use, which is precisely where it is most unacceptable. Therefore, we
-have designed this version of the GPL to prohibit the practice for those
-products. If such problems arise substantially in other domains, we
-stand ready to extend this provision to those domains in future versions
-of the GPL, as needed to protect the freedom of users.
-
- Finally, every program is threatened constantly by software patents.
-States should not allow patents to restrict development and use of
-software on general-purpose computers, but in those that do, we wish to
-avoid the special danger that patents applied to a free program could
-make it effectively proprietary. To prevent this, the GPL assures that
-patents cannot be used to render the program non-free.
-
- The precise terms and conditions for copying, distribution and
-modification follow.
-
- TERMS AND CONDITIONS
-
- 0. Definitions.
-
- "This License" refers to version 3 of the GNU General Public License.
-
- "Copyright" also means copyright-like laws that apply to other kinds of
-works, such as semiconductor masks.
-
- "The Program" refers to any copyrightable work licensed under this
-License. Each licensee is addressed as "you". "Licensees" and
-"recipients" may be individuals or organizations.
-
- To "modify" a work means to copy from or adapt all or part of the work
-in a fashion requiring copyright permission, other than the making of an
-exact copy. The resulting work is called a "modified version" of the
-earlier work or a work "based on" the earlier work.
-
- A "covered work" means either the unmodified Program or a work based
-on the Program.
-
- To "propagate" a work means to do anything with it that, without
-permission, would make you directly or secondarily liable for
-infringement under applicable copyright law, except executing it on a
-computer or modifying a private copy. Propagation includes copying,
-distribution (with or without modification), making available to the
-public, and in some countries other activities as well.
-
- To "convey" a work means any kind of propagation that enables other
-parties to make or receive copies. Mere interaction with a user through
-a computer network, with no transfer of a copy, is not conveying.
-
- An interactive user interface displays "Appropriate Legal Notices"
-to the extent that it includes a convenient and prominently visible
-feature that (1) displays an appropriate copyright notice, and (2)
-tells the user that there is no warranty for the work (except to the
-extent that warranties are provided), that licensees may convey the
-work under this License, and how to view a copy of this License. If
-the interface presents a list of user commands or options, such as a
-menu, a prominent item in the list meets this criterion.
-
- 1. Source Code.
-
- The "source code" for a work means the preferred form of the work
-for making modifications to it. "Object code" means any non-source
-form of a work.
-
- A "Standard Interface" means an interface that either is an official
-standard defined by a recognized standards body, or, in the case of
-interfaces specified for a particular programming language, one that
-is widely used among developers working in that language.
-
- The "System Libraries" of an executable work include anything, other
-than the work as a whole, that (a) is included in the normal form of
-packaging a Major Component, but which is not part of that Major
-Component, and (b) serves only to enable use of the work with that
-Major Component, or to implement a Standard Interface for which an
-implementation is available to the public in source code form. A
-"Major Component", in this context, means a major essential component
-(kernel, window system, and so on) of the specific operating system
-(if any) on which the executable work runs, or a compiler used to
-produce the work, or an object code interpreter used to run it.
-
- The "Corresponding Source" for a work in object code form means all
-the source code needed to generate, install, and (for an executable
-work) run the object code and to modify the work, including scripts to
-control those activities. However, it does not include the work's
-System Libraries, or general-purpose tools or generally available free
-programs which are used unmodified in performing those activities but
-which are not part of the work. For example, Corresponding Source
-includes interface definition files associated with source files for
-the work, and the source code for shared libraries and dynamically
-linked subprograms that the work is specifically designed to require,
-such as by intimate data communication or control flow between those
-subprograms and other parts of the work.
-
- The Corresponding Source need not include anything that users
-can regenerate automatically from other parts of the Corresponding
-Source.
-
- The Corresponding Source for a work in source code form is that
-same work.
-
- 2. Basic Permissions.
-
- All rights granted under this License are granted for the term of
-copyright on the Program, and are irrevocable provided the stated
-conditions are met. This License explicitly affirms your unlimited
-permission to run the unmodified Program. The output from running a
-covered work is covered by this License only if the output, given its
-content, constitutes a covered work. This License acknowledges your
-rights of fair use or other equivalent, as provided by copyright law.
-
- You may make, run and propagate covered works that you do not
-convey, without conditions so long as your license otherwise remains
-in force. You may convey covered works to others for the sole purpose
-of having them make modifications exclusively for you, or provide you
-with facilities for running those works, provided that you comply with
-the terms of this License in conveying all material for which you do
-not control copyright. Those thus making or running the covered works
-for you must do so exclusively on your behalf, under your direction
-and control, on terms that prohibit them from making any copies of
-your copyrighted material outside their relationship with you.
-
- Conveying under any other circumstances is permitted solely under
-the conditions stated below. Sublicensing is not allowed; section 10
-makes it unnecessary.
-
- 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
-
- No covered work shall be deemed part of an effective technological
-measure under any applicable law fulfilling obligations under article
-11 of the WIPO copyright treaty adopted on 20 December 1996, or
-similar laws prohibiting or restricting circumvention of such
-measures.
-
- When you convey a covered work, you waive any legal power to forbid
-circumvention of technological measures to the extent such circumvention
-is effected by exercising rights under this License with respect to
-the covered work, and you disclaim any intention to limit operation or
-modification of the work as a means of enforcing, against the work's
-users, your or third parties' legal rights to forbid circumvention of
-technological measures.
-
- 4. Conveying Verbatim Copies.
-
- You may convey verbatim copies of the Program's source code as you
-receive it, in any medium, provided that you conspicuously and
-appropriately publish on each copy an appropriate copyright notice;
-keep intact all notices stating that this License and any
-non-permissive terms added in accord with section 7 apply to the code;
-keep intact all notices of the absence of any warranty; and give all
-recipients a copy of this License along with the Program.
-
- You may charge any price or no price for each copy that you convey,
-and you may offer support or warranty protection for a fee.
-
- 5. Conveying Modified Source Versions.
-
- You may convey a work based on the Program, or the modifications to
-produce it from the Program, in the form of source code under the
-terms of section 4, provided that you also meet all of these conditions:
-
- a) The work must carry prominent notices stating that you modified
- it, and giving a relevant date.
-
- b) The work must carry prominent notices stating that it is
- released under this License and any conditions added under section
- 7. This requirement modifies the requirement in section 4 to
- "keep intact all notices".
-
- c) You must license the entire work, as a whole, under this
- License to anyone who comes into possession of a copy. This
- License will therefore apply, along with any applicable section 7
- additional terms, to the whole of the work, and all its parts,
- regardless of how they are packaged. This License gives no
- permission to license the work in any other way, but it does not
- invalidate such permission if you have separately received it.
-
- d) If the work has interactive user interfaces, each must display
- Appropriate Legal Notices; however, if the Program has interactive
- interfaces that do not display Appropriate Legal Notices, your
- work need not make them do so.
-
- A compilation of a covered work with other separate and independent
-works, which are not by their nature extensions of the covered work,
-and which are not combined with it such as to form a larger program,
-in or on a volume of a storage or distribution medium, is called an
-"aggregate" if the compilation and its resulting copyright are not
-used to limit the access or legal rights of the compilation's users
-beyond what the individual works permit. Inclusion of a covered work
-in an aggregate does not cause this License to apply to the other
-parts of the aggregate.
-
- 6. Conveying Non-Source Forms.
-
- You may convey a covered work in object code form under the terms
-of sections 4 and 5, provided that you also convey the
-machine-readable Corresponding Source under the terms of this License,
-in one of these ways:
-
- a) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by the
- Corresponding Source fixed on a durable physical medium
- customarily used for software interchange.
-
- b) Convey the object code in, or embodied in, a physical product
- (including a physical distribution medium), accompanied by a
- written offer, valid for at least three years and valid for as
- long as you offer spare parts or customer support for that product
- model, to give anyone who possesses the object code either (1) a
- copy of the Corresponding Source for all the software in the
- product that is covered by this License, on a durable physical
- medium customarily used for software interchange, for a price no
- more than your reasonable cost of physically performing this
- conveying of source, or (2) access to copy the
- Corresponding Source from a network server at no charge.
-
- c) Convey individual copies of the object code with a copy of the
- written offer to provide the Corresponding Source. This
- alternative is allowed only occasionally and noncommercially, and
- only if you received the object code with such an offer, in accord
- with subsection 6b.
-
- d) Convey the object code by offering access from a designated
- place (gratis or for a charge), and offer equivalent access to the
- Corresponding Source in the same way through the same place at no
- further charge. You need not require recipients to copy the
- Corresponding Source along with the object code. If the place to
- copy the object code is a network server, the Corresponding Source
- may be on a different server (operated by you or a third party)
- that supports equivalent copying facilities, provided you maintain
- clear directions next to the object code saying where to find the
- Corresponding Source. Regardless of what server hosts the
- Corresponding Source, you remain obligated to ensure that it is
- available for as long as needed to satisfy these requirements.
-
- e) Convey the object code using peer-to-peer transmission, provided
- you inform other peers where the object code and Corresponding
- Source of the work are being offered to the general public at no
- charge under subsection 6d.
-
- A separable portion of the object code, whose source code is excluded
-from the Corresponding Source as a System Library, need not be
-included in conveying the object code work.
-
- A "User Product" is either (1) a "consumer product", which means any
-tangible personal property which is normally used for personal, family,
-or household purposes, or (2) anything designed or sold for incorporation
-into a dwelling. In determining whether a product is a consumer product,
-doubtful cases shall be resolved in favor of coverage. For a particular
-product received by a particular user, "normally used" refers to a
-typical or common use of that class of product, regardless of the status
-of the particular user or of the way in which the particular user
-actually uses, or expects or is expected to use, the product. A product
-is a consumer product regardless of whether the product has substantial
-commercial, industrial or non-consumer uses, unless such uses represent
-the only significant mode of use of the product.
-
- "Installation Information" for a User Product means any methods,
-procedures, authorization keys, or other information required to install
-and execute modified versions of a covered work in that User Product from
-a modified version of its Corresponding Source. The information must
-suffice to ensure that the continued functioning of the modified object
-code is in no case prevented or interfered with solely because
-modification has been made.
-
- If you convey an object code work under this section in, or with, or
-specifically for use in, a User Product, and the conveying occurs as
-part of a transaction in which the right of possession and use of the
-User Product is transferred to the recipient in perpetuity or for a
-fixed term (regardless of how the transaction is characterized), the
-Corresponding Source conveyed under this section must be accompanied
-by the Installation Information. But this requirement does not apply
-if neither you nor any third party retains the ability to install
-modified object code on the User Product (for example, the work has
-been installed in ROM).
-
- The requirement to provide Installation Information does not include a
-requirement to continue to provide support service, warranty, or updates
-for a work that has been modified or installed by the recipient, or for
-the User Product in which it has been modified or installed. Access to a
-network may be denied when the modification itself materially and
-adversely affects the operation of the network or violates the rules and
-protocols for communication across the network.
-
- Corresponding Source conveyed, and Installation Information provided,
-in accord with this section must be in a format that is publicly
-documented (and with an implementation available to the public in
-source code form), and must require no special password or key for
-unpacking, reading or copying.
-
- 7. Additional Terms.
-
- "Additional permissions" are terms that supplement the terms of this
-License by making exceptions from one or more of its conditions.
-Additional permissions that are applicable to the entire Program shall
-be treated as though they were included in this License, to the extent
-that they are valid under applicable law. If additional permissions
-apply only to part of the Program, that part may be used separately
-under those permissions, but the entire Program remains governed by
-this License without regard to the additional permissions.
-
- When you convey a copy of a covered work, you may at your option
-remove any additional permissions from that copy, or from any part of
-it. (Additional permissions may be written to require their own
-removal in certain cases when you modify the work.) You may place
-additional permissions on material, added by you to a covered work,
-for which you have or can give appropriate copyright permission.
-
- Notwithstanding any other provision of this License, for material you
-add to a covered work, you may (if authorized by the copyright holders of
-that material) supplement the terms of this License with terms:
-
- a) Disclaiming warranty or limiting liability differently from the
- terms of sections 15 and 16 of this License; or
-
- b) Requiring preservation of specified reasonable legal notices or
- author attributions in that material or in the Appropriate Legal
- Notices displayed by works containing it; or
-
- c) Prohibiting misrepresentation of the origin of that material, or
- requiring that modified versions of such material be marked in
- reasonable ways as different from the original version; or
-
- d) Limiting the use for publicity purposes of names of licensors or
- authors of the material; or
-
- e) Declining to grant rights under trademark law for use of some
- trade names, trademarks, or service marks; or
-
- f) Requiring indemnification of licensors and authors of that
- material by anyone who conveys the material (or modified versions of
- it) with contractual assumptions of liability to the recipient, for
- any liability that these contractual assumptions directly impose on
- those licensors and authors.
-
- All other non-permissive additional terms are considered "further
-restrictions" within the meaning of section 10. If the Program as you
-received it, or any part of it, contains a notice stating that it is
-governed by this License along with a term that is a further
-restriction, you may remove that term. If a license document contains
-a further restriction but permits relicensing or conveying under this
-License, you may add to a covered work material governed by the terms
-of that license document, provided that the further restriction does
-not survive such relicensing or conveying.
-
- If you add terms to a covered work in accord with this section, you
-must place, in the relevant source files, a statement of the
-additional terms that apply to those files, or a notice indicating
-where to find the applicable terms.
-
- Additional terms, permissive or non-permissive, may be stated in the
-form of a separately written license, or stated as exceptions;
-the above requirements apply either way.
-
- 8. Termination.
-
- You may not propagate or modify a covered work except as expressly
-provided under this License. Any attempt otherwise to propagate or
-modify it is void, and will automatically terminate your rights under
-this License (including any patent licenses granted under the third
-paragraph of section 11).
-
- However, if you cease all violation of this License, then your
-license from a particular copyright holder is reinstated (a)
-provisionally, unless and until the copyright holder explicitly and
-finally terminates your license, and (b) permanently, if the copyright
-holder fails to notify you of the violation by some reasonable means
-prior to 60 days after the cessation.
-
- Moreover, your license from a particular copyright holder is
-reinstated permanently if the copyright holder notifies you of the
-violation by some reasonable means, this is the first time you have
-received notice of violation of this License (for any work) from that
-copyright holder, and you cure the violation prior to 30 days after
-your receipt of the notice.
-
- Termination of your rights under this section does not terminate the
-licenses of parties who have received copies or rights from you under
-this License. If your rights have been terminated and not permanently
-reinstated, you do not qualify to receive new licenses for the same
-material under section 10.
-
- 9. Acceptance Not Required for Having Copies.
-
- You are not required to accept this License in order to receive or
-run a copy of the Program. Ancillary propagation of a covered work
-occurring solely as a consequence of using peer-to-peer transmission
-to receive a copy likewise does not require acceptance. However,
-nothing other than this License grants you permission to propagate or
-modify any covered work. These actions infringe copyright if you do
-not accept this License. Therefore, by modifying or propagating a
-covered work, you indicate your acceptance of this License to do so.
-
- 10. Automatic Licensing of Downstream Recipients.
-
- Each time you convey a covered work, the recipient automatically
-receives a license from the original licensors, to run, modify and
-propagate that work, subject to this License. You are not responsible
-for enforcing compliance by third parties with this License.
-
- An "entity transaction" is a transaction transferring control of an
-organization, or substantially all assets of one, or subdividing an
-organization, or merging organizations. If propagation of a covered
-work results from an entity transaction, each party to that
-transaction who receives a copy of the work also receives whatever
-licenses to the work the party's predecessor in interest had or could
-give under the previous paragraph, plus a right to possession of the
-Corresponding Source of the work from the predecessor in interest, if
-the predecessor has it or can get it with reasonable efforts.
-
- You may not impose any further restrictions on the exercise of the
-rights granted or affirmed under this License. For example, you may
-not impose a license fee, royalty, or other charge for exercise of
-rights granted under this License, and you may not initiate litigation
-(including a cross-claim or counterclaim in a lawsuit) alleging that
-any patent claim is infringed by making, using, selling, offering for
-sale, or importing the Program or any portion of it.
-
- 11. Patents.
-
- A "contributor" is a copyright holder who authorizes use under this
-License of the Program or a work on which the Program is based. The
-work thus licensed is called the contributor's "contributor version".
-
- A contributor's "essential patent claims" are all patent claims
-owned or controlled by the contributor, whether already acquired or
-hereafter acquired, that would be infringed by some manner, permitted
-by this License, of making, using, or selling its contributor version,
-but do not include claims that would be infringed only as a
-consequence of further modification of the contributor version. For
-purposes of this definition, "control" includes the right to grant
-patent sublicenses in a manner consistent with the requirements of
-this License.
-
- Each contributor grants you a non-exclusive, worldwide, royalty-free
-patent license under the contributor's essential patent claims, to
-make, use, sell, offer for sale, import and otherwise run, modify and
-propagate the contents of its contributor version.
-
- In the following three paragraphs, a "patent license" is any express
-agreement or commitment, however denominated, not to enforce a patent
-(such as an express permission to practice a patent or covenant not to
-sue for patent infringement). To "grant" such a patent license to a
-party means to make such an agreement or commitment not to enforce a
-patent against the party.
-
- If you convey a covered work, knowingly relying on a patent license,
-and the Corresponding Source of the work is not available for anyone
-to copy, free of charge and under the terms of this License, through a
-publicly available network server or other readily accessible means,
-then you must either (1) cause the Corresponding Source to be so
-available, or (2) arrange to deprive yourself of the benefit of the
-patent license for this particular work, or (3) arrange, in a manner
-consistent with the requirements of this License, to extend the patent
-license to downstream recipients. "Knowingly relying" means you have
-actual knowledge that, but for the patent license, your conveying the
-covered work in a country, or your recipient's use of the covered work
-in a country, would infringe one or more identifiable patents in that
-country that you have reason to believe are valid.
-
- If, pursuant to or in connection with a single transaction or
-arrangement, you convey, or propagate by procuring conveyance of, a
-covered work, and grant a patent license to some of the parties
-receiving the covered work authorizing them to use, propagate, modify
-or convey a specific copy of the covered work, then the patent license
-you grant is automatically extended to all recipients of the covered
-work and works based on it.
-
- A patent license is "discriminatory" if it does not include within
-the scope of its coverage, prohibits the exercise of, or is
-conditioned on the non-exercise of one or more of the rights that are
-specifically granted under this License. You may not convey a covered
-work if you are a party to an arrangement with a third party that is
-in the business of distributing software, under which you make payment
-to the third party based on the extent of your activity of conveying
-the work, and under which the third party grants, to any of the
-parties who would receive the covered work from you, a discriminatory
-patent license (a) in connection with copies of the covered work
-conveyed by you (or copies made from those copies), or (b) primarily
-for and in connection with specific products or compilations that
-contain the covered work, unless you entered into that arrangement,
-or that patent license was granted, prior to 28 March 2007.
-
- Nothing in this License shall be construed as excluding or limiting
-any implied license or other defenses to infringement that may
-otherwise be available to you under applicable patent law.
-
- 12. No Surrender of Others' Freedom.
-
- If conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License. If you cannot convey a
-covered work so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you may
-not convey it at all. For example, if you agree to terms that obligate you
-to collect a royalty for further conveying from those to whom you convey
-the Program, the only way you could satisfy both those terms and this
-License would be to refrain entirely from conveying the Program.
-
- 13. Use with the GNU Affero General Public License.
-
- Notwithstanding any other provision of this License, you have
-permission to link or combine any covered work with a work licensed
-under version 3 of the GNU Affero General Public License into a single
-combined work, and to convey the resulting work. The terms of this
-License will continue to apply to the part which is the covered work,
-but the special requirements of the GNU Affero General Public License,
-section 13, concerning interaction through a network will apply to the
-combination as such.
-
- 14. Revised Versions of this License.
-
- The Free Software Foundation may publish revised and/or new versions of
-the GNU General Public License from time to time. Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
- Each version is given a distinguishing version number. If the
-Program specifies that a certain numbered version of the GNU General
-Public License "or any later version" applies to it, you have the
-option of following the terms and conditions either of that numbered
-version or of any later version published by the Free Software
-Foundation. If the Program does not specify a version number of the
-GNU General Public License, you may choose any version ever published
-by the Free Software Foundation.
-
- If the Program specifies that a proxy can decide which future
-versions of the GNU General Public License can be used, that proxy's
-public statement of acceptance of a version permanently authorizes you
-to choose that version for the Program.
-
- Later license versions may give you additional or different
-permissions. However, no additional obligations are imposed on any
-author or copyright holder as a result of your choosing to follow a
-later version.
-
- 15. Disclaimer of Warranty.
-
- THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
-APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
-HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
-OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
-THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
-IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
-ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
- 16. Limitation of Liability.
-
- IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
-THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
-GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
-USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
-DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
-PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
-EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
-SUCH DAMAGES.
-
- 17. Interpretation of Sections 15 and 16.
-
- If the disclaimer of warranty and limitation of liability provided
-above cannot be given local legal effect according to their terms,
-reviewing courts shall apply local law that most closely approximates
-an absolute waiver of all civil liability in connection with the
-Program, unless a warranty or assumption of liability accompanies a
-copy of the Program in return for a fee.
-
- END OF TERMS AND CONDITIONS
-
- How to Apply These Terms to Your New Programs
-
- If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
- To do so, attach the following notices to the program. It is safest
-to attach them to the start of each source file to most effectively
-state the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-
- Copyright (C)
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
-
-Also add information on how to contact you by electronic and paper mail.
-
- If the program does terminal interaction, make it output a short
-notice like this when it starts in an interactive mode:
-
- Copyright (C)
- This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
- This is free software, and you are welcome to redistribute it
- under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License. Of course, your program's commands
-might be different; for a GUI interface, you would use an "about box".
-
- You should also get your employer (if you work as a programmer) or school,
-if any, to sign a "copyright disclaimer" for the program, if necessary.
-For more information on this, and how to apply and follow the GNU GPL, see
-<https://www.gnu.org/licenses/>.
-
- The GNU General Public License does not permit incorporating your program
-into proprietary programs. If your program is a subroutine library, you
-may consider it more useful to permit linking proprietary applications with
-the library. If this is what you want to do, use the GNU Lesser General
-Public License instead of this License. But first, please read
-<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README.md b/README.md
index 5347d9d..4dd7a22 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,125 @@
-## SciELO Usage
+# SciELO Usage Metrics Pipeline
-## Dev Installation
+A modernized platform for processing and indexing SciELO usage logs into OpenSearch, adhering to COUNTER R5.1 standards.
-To build and run the application, being at the root of the project, you can follow these steps:
+**Version**: 2.0.0
+
+## Quick Start (Dev Installation)
+
+To build and run the application locally:
1. `make build compose=local.yml`
-2. `make django_makemigrations`
-3. `make django_migrate`
-4. `make django_createsuperuser`
-5. `make up`
-
-After the fifth step, the application should be functional and accessible at http://0.0.0.0:8009/admin
-
-### Additional notes:
-
-* The instructions assume that you have a working installation of Docker and `make`.
-* The `make` commands use the `compose` file `local.yml` to start the application containers.
-* The `django_makemigrations` and `django_migrate` commands are used to create and apply database migrations.
-* The `django_createsuperuser` command is used to create a superuser account for the application.
-* The `make up` command starts the application containers in the background.
-* The application is accessible at http://0.0.0.0:8009/admin.
-* To log in to the admin panel, you will need to use the superuser credentials that you created with the `django_createsuperuser` command.
-* The `Log Manager` tool can be used to view log files and manage application configurations.
-* To test the application, you will need to add some content, such as a list of collections and configurations.
+2. `make django_migrate`
+3. `make django_createsuperuser`
+4. `make up`
+
+The application will be accessible at [http://localhost:8009/admin](http://localhost:8009/admin).
+
+---
+
+## Key Commands
+
+All commands run inside Docker via the `local.yml` compose file unless noted.
+
+```bash
+make build # build images
+make up # start all services (django, postgres, redis, celery worker+beat, mailhog)
+make django_shell # Django shell via docker compose
+make django_test # run full test suite (pytest)
+make django_fast # tests with --failfast
+make django_migrate # apply migrations
+make django_makemigrations # generate new migrations
+make django_createsuperuser # create Wagtail admin user
+```
+
+**Run a single test file/path:**
+```bash
+docker compose -f local.yml run --rm django pytest path/to/test_file.py
+```
+
+## Architecture & Data Pipeline
+
+### Apps
+
+| App | Purpose |
+|---|---|
+| `log_manager` | Log file discovery, validation, and status tracking |
+| `log_manager_config` | Collection-specific configuration (paths, emails, expected logs/day) |
+| `metrics` | Daily metric jobs, OpenSearch export, COUNTER R5.1 aggregation |
+| `document` | Unified metadata model for articles, books, chapters, datasets, and preprints |
+| `source` | Journal, book, preprint server, and data repository metadata |
+| `reports` | Weekly, monthly, and yearly log processing reports |
+| `resources` | Robot user-agent patterns and GeoIP MMDB management |
+| `tracker` | Discarded line tracking and error logging |
+| `core` | Wagtail pages, users, shared utilities, and external API collectors |
+| `collection` | SciELO collection management |
+
+### Core Collectors (`core/collectors/`)
+
+| Collector | Source |
+|---|---|
+| `articlemeta.py` | ArticleMeta REST/Thrift API |
+| `opac.py` | SciELO OPAC endpoint |
+| `preprints.py` | SciELO Preprints OAI-PMH |
+| `dataverse.py` | SciELO Data (Dataverse) |
+| `scielo_books.py` | SciELO Books CouchDB changes feed |
+
+### Log Ingestion Pipeline
+
+The ingestion is fully automated via the **`[Log Pipeline] Daily Routine (Auto)`** task. It follows a strictly ordered sequence wired together with Celery chords:
+
+- **Search**: Scans configured directories for new `.log` or `.gz` files.
+- **Validate**: Performs statistical sampling to ensure log integrity and detect the usage date.
+- **Parse**: Extracts metrics using `scielo_usage_counter`, performs URL translation, and aggregates data.
+- **Export**: Pushes results to OpenSearch using idempotent upsert scripts.
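+
+A minimal sketch of how a chord stage such as Parse -> Export can be wired (the Celery app, task names, and signatures below are placeholders, not the project's real tasks):
+
+```python
+from celery import Celery, chord, group
+
+app = Celery(__name__)  # stand-in; the project uses config.celery_app
+
+
+@app.task
+def parse_log_file(log_file_id):
+    # Placeholder for the real Parse step.
+    return log_file_id
+
+
+@app.task
+def export_metrics(parse_results):
+    # Placeholder for the real Export step; runs once, after every parse finishes.
+    return len(parse_results)
+
+
+def run_parse_then_export(log_file_ids):
+    return chord(group(parse_log_file.s(pk) for pk in log_file_ids))(export_metrics.s())
+```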
+
+### Metadata Synchronization
+
+Metadata is kept in sync with SciELO sources (ArticleMeta, OPAC, Books, etc.) via the **`[Metadata] Daily Sync Routine (Auto)`** task, which runs parallel workers to ensure documents and sources are always up to date.
+
+## Supported Log Formats
+
+| Format | Description |
+|---|---|
+| NCSA Extended | Standard Apache combined log format with optional domain prefix and IP list fields. |
+| BunnyCDN | Pipe-delimited format with Unix timestamps (7 or 10 digits), country codes, and request IDs. |
+
+## Environment Variables
+
+| Variable | Default | Description |
+|---|---|---|
+| `OPENSEARCH_URL` | — | OpenSearch cluster URL |
+| `OPENSEARCH_BASIC_AUTH` | — | OpenSearch basic auth credentials (comma-separated `user,pass`, as in the sample `.envs/.local/.django`) |
+| `OPENSEARCH_VERIFY_CERTS` | `False` | Verify SSL certificates for OpenSearch connections |
+| `USE_LOCAL_SCIELO_LIBS` | `0` | Mount local `scielo_log_validator` and `scielo_usage_counter` repos for development |
+| `DJANGO_SETTINGS_MODULE` | `config.settings.local` | Django settings module |
+| `REDIS_URL` | — | Redis connection URL for Celery |
+
+## OpenSearch Storage Strategy (Hybrid Monthly)
+
+To optimize storage and performance, this system employs a **Hybrid Granularity** approach in OpenSearch:
+
+- **Monthly Partitioning**: Indices are partitioned by month (e.g., `usage_monthly_books_2026`).
+- **One Document per Month**: Each article/PID has exactly **one document per month**, drastically reducing the total document count (up to 30x reduction).
+- **Daily Nested Metrics**: Daily granularity is preserved inside each monthly document using a `daily_metrics` object.
+- **Atomic Upserts**: Data is merged using OpenSearch **Painless Scripts**, allowing multiple logs for the same day/month to be processed without data duplication or loss.
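+
+A hedged sketch of the scripted-upsert idea with `opensearch-py` (index, field names, and the Painless source are illustrative; the production script merges full COUNTER metric sets):
+
+```python
+def upsert_daily_views(client, index, doc_id, day, views):
+    # Adds `views` to the counter for `day`, creating the document if it is new.
+    client.update(
+        index=index,
+        id=doc_id,
+        body={
+            "script": {
+                "lang": "painless",
+                "source": (
+                    "def daily = ctx._source.daily_metrics;"
+                    "daily[params.day] = (daily.containsKey(params.day) ? daily[params.day] : 0) + params.views;"
+                ),
+                "params": {"day": day, "views": views},
+            },
+            "upsert": {"daily_metrics": {day: views}},
+        },
+    )
+```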
+
+## Management & Monitoring
+
+All pipelines can be monitored through the **Wagtail Admin**:
+
+- **Log Manager**: Monitor the status of individual log files (`QUEUED`, `PARSING`, `PROCESSED`).
+- **Daily Metric Jobs**: Track the history of daily processing and OpenSearch export attempts.
+- **Log Config**: Manage collection-specific settings, log paths, and notification emails.
+
+### Useful Commands
+
+- `make django_shell`: Access the Django interactive shell.
+- `docker logs -f scielo_usage_local_celeryworker`: Monitor real-time task execution.
+
+## Dependencies
+
+- [scielo_log_validator](https://github.com/scieloorg/scielo_log_validator) — log file validation
+- [scielo_usage_counter](https://github.com/scieloorg/scielo_usage_counter) — COUNTER R5.1 metrics extraction
+- [device_detector](https://github.com/thinkwelltwd/device_detector) — client name/version detection
+- [opensearch-py](https://github.com/opensearch-project/opensearch-py) — OpenSearch client
diff --git a/VERSION b/VERSION
index 850e742..227cea2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.14.0
+2.0.0
diff --git a/article/admin.py b/article/admin.py
deleted file mode 100644
index 8c38f3f..0000000
--- a/article/admin.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.contrib import admin
-
-# Register your models here.
diff --git a/article/management/commands/load_articles_by_year.py b/article/management/commands/load_articles_by_year.py
deleted file mode 100644
index 335598e..0000000
--- a/article/management/commands/load_articles_by_year.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from django.core.management.base import BaseCommand
-
-from article.tasks import task_load_article_from_opac, task_load_article_from_article_meta
-
-
-class Command(BaseCommand):
- help = 'Generate task requests for loading article data from Article Meta for each year from 1900 to 2025'
-
- def add_arguments(self, parser):
- parser.add_argument(
- '--start-year',
- type=int,
- default=1990,
- help='Start year (default: 1990)'
- )
- parser.add_argument(
- '--end-year',
- type=int,
- default=2025,
- help='End year (default: 2025)'
- )
- parser.add_argument(
- '--collection',
- type=str,
- default='scl',
- help='Collection code (default: scl)'
- )
- parser.add_argument(
- '--task',
- choices=['load_article_from_opac', 'load_article_from_article_meta'],
- default='load_article_from_opac',
- help='Task to execute (default: load_article_from_opac)',
- )
-
- def handle(self, *args, **options):
- start_year = options['start_year']
- end_year = options['end_year']
- collection = options['collection']
-
- self.stdout.write(
- self.style.SUCCESS(
- f'Generating task requests from {start_year} to {end_year} for collection: {collection}'
- )
- )
-
- total_tasks = 0
-
- for year in range(start_year, end_year + 1):
- from_date = f'{year}-01-01'
- until_date = f'{year}-12-31'
-
- self.stdout.write(f'Queuing task for year {year}...')
-
- # Queue the task for each year
- if options['task'] == 'load_article_from_article_meta':
- task_result = task_load_article_from_article_meta.delay(
- from_date=from_date,
- until_date=until_date,
- collection=collection
- )
- else:
- task_result = task_load_article_from_opac.delay(
- from_date=from_date,
- until_date=until_date,
- collection=collection
- )
-
- total_tasks += 1
-
- self.stdout.write(
- self.style.SUCCESS(
- f'✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})'
- )
- )
-
- self.stdout.write(
- self.style.SUCCESS(
- f'\nCompleted! {total_tasks} tasks have been queued successfully.'
- )
- )
diff --git a/article/migrations/0001_initial.py b/article/migrations/0001_initial.py
deleted file mode 100644
index 816d61e..0000000
--- a/article/migrations/0001_initial.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Generated by Django 5.0.7 on 2025-02-07 17:50
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- initial = True
-
- dependencies = [
- ("collection", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="Article",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "scielo_issn",
- models.CharField(
- db_index=True, max_length=9, verbose_name="SciELO ISSN"
- ),
- ),
- (
- "pid_v2",
- models.CharField(
- db_index=True, max_length=23, verbose_name="PID V2"
- ),
- ),
- (
- "pid_v3",
- models.CharField(
- blank=True,
- db_index=True,
- max_length=23,
- null=True,
- verbose_name="PID V3",
- ),
- ),
- (
- "pdfs",
- models.JSONField(
- blank=True,
- default=dict,
- null=True,
- verbose_name="Format with Language",
- ),
- ),
- (
- "default_lang",
- models.CharField(max_length=2, verbose_name="Default Language"),
- ),
- (
- "text_langs",
- models.JSONField(
- blank=True,
- default=dict,
- null=True,
- verbose_name="Text Languages",
- ),
- ),
- (
- "processing_date",
- models.CharField(max_length=32, verbose_name="Processing Date"),
- ),
- (
- "publication_date",
- models.CharField(max_length=32, verbose_name="Publication Date"),
- ),
- (
- "publication_year",
- models.CharField(
- db_index=True, max_length=4, verbose_name="Publication Year"
- ),
- ),
- (
- "collection",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="collection.collection",
- verbose_name="Collection",
- ),
- ),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "verbose_name": "Article",
- "verbose_name_plural": "Articles",
- "unique_together": {("collection", "scielo_issn", "pid_v2", "pid_v3")},
- },
- ),
- ]
diff --git a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py b/article/migrations/0002_alter_article_unique_together_article_files_and_more.py
deleted file mode 100644
index cee055c..0000000
--- a/article/migrations/0002_alter_article_unique_together_article_files_and_more.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Generated by Django 5.0.7 on 2025-04-01 01:09
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("article", "0001_initial"),
- ("collection", "0001_initial"),
- ]
-
- operations = [
- migrations.AddField(
- model_name="article",
- name="files",
- field=models.JSONField(
- blank=True, default=dict, null=True, verbose_name="Files"
- ),
- ),
- migrations.AddField(
- model_name="article",
- name="pid_generic",
- field=models.CharField(
- blank=True,
- db_index=True,
- max_length=50,
- null=True,
- verbose_name="PID Generic",
- ),
- ),
- migrations.RemoveField(
- model_name="article",
- name="pdfs",
- ),
- migrations.AlterUniqueTogether(
- name="article",
- unique_together={
- ("collection", "scielo_issn", "pid_v2", "pid_v3", "pid_generic")
- },
- ),
-
- ]
diff --git a/article/migrations/0003_article_collection_scielo_issn_idx.py b/article/migrations/0003_article_collection_scielo_issn_idx.py
deleted file mode 100644
index 753ac98..0000000
--- a/article/migrations/0003_article_collection_scielo_issn_idx.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-12 17:16
-
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("article", "0002_alter_article_unique_together_article_files_and_more"),
- ("collection", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.AddIndex(
- model_name="article",
- index=models.Index(
- fields=["collection", "scielo_issn"], name="collection_scielo_issn_idx"
- ),
- ),
- ]
diff --git a/article/migrations/__init__.py b/article/migrations/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/article/models.py b/article/models.py
deleted file mode 100644
index 80d2a97..0000000
--- a/article/models.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from django.db import models
-from django.utils.translation import gettext_lazy as _
-
-from core.models import CommonControlField
-from collection.models import Collection
-
-
-class Article(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.CASCADE,
- blank=False,
- null=False,
- db_index=True,
- )
-
- scielo_issn = models.CharField(
- verbose_name=_('SciELO ISSN'),
- max_length=9,
- blank=False,
- null=False,
- db_index=True,
- )
-
- pid_v2 = models.CharField(
- verbose_name=_('PID V2'),
- max_length=23,
- blank=False,
- null=False,
- db_index=True,
- )
-
- pid_v3 = models.CharField(
- verbose_name=_('PID V3'),
- max_length=23,
- blank=True,
- null=True,
- db_index=True,
- )
-
- pid_generic = models.CharField(
- verbose_name=_('PID Generic'),
- max_length=50,
- blank=True,
- null=True,
- db_index=True,
- )
-
- files = models.JSONField(
- verbose_name=_('Files'),
- null=True,
- blank=True,
- default=dict,
- )
-
- default_lang = models.CharField(
- verbose_name=_('Default Language'),
- max_length=2,
- blank=False,
- null=False,
- )
-
- text_langs = models.JSONField(
- verbose_name=_('Text Languages'),
- null=True,
- blank=True,
- default=dict,
- )
-
- processing_date = models.CharField(
- verbose_name=_('Processing Date'),
- max_length=32,
- null=False,
- blank=False,
- )
-
- publication_date = models.CharField(
- verbose_name=_('Publication Date'),
- max_length=32,
- null=False,
- blank=False,
- )
-
- publication_year = models.CharField(
- verbose_name=_('Publication Year'),
- max_length=4,
- null=False,
- blank=False,
- db_index=True,
- )
-
- def __str__(self):
- return f'{self.collection.acron3} - {self.scielo_issn} - {self.pid_v2 or self.pid_v3 or self.pid_generic}'
-
- @classmethod
- def metadata(cls, collection=None):
- qs = cls.objects.select_related('collection').only(
- 'collection__acron3',
- 'default_lang',
- 'files',
- 'pid_v2',
- 'pid_v3',
- 'pid_generic',
- 'processing_date',
- 'publication_date',
- 'publication_year',
- 'scielo_issn',
- 'text_langs',
- )
-
- if collection:
- qs = qs.filter(collection=collection)
-
- for a in qs.iterator():
- yield {
- 'collection': a.collection.acron3,
- 'default_lang': a.default_lang,
- 'files': a.files,
- 'pid_v2': a.pid_v2,
- 'pid_v3': a.pid_v3,
- 'pid_generic': a.pid_generic,
- 'processing_date': a.processing_date,
- 'publication_date': a.publication_date,
- 'publication_year': a.publication_year,
- 'scielo_issn': a.scielo_issn,
- 'text_langs': a.text_langs,
- }
-
- class Meta:
- verbose_name = _('Article')
- verbose_name_plural = _('Articles')
- unique_together = (
- 'collection',
- 'scielo_issn',
- 'pid_v2',
- 'pid_v3',
- 'pid_generic',
- )
- indexes = [
- models.Index(fields=['collection', 'scielo_issn'], name='collection_scielo_issn_idx'),
- ]
-
diff --git a/article/tasks.py b/article/tasks.py
deleted file mode 100644
index 3514fca..0000000
--- a/article/tasks.py
+++ /dev/null
@@ -1,259 +0,0 @@
-import logging
-
-from django.contrib.auth import get_user_model
-from django.db.models import Q
-from django.db import DataError
-from django.utils.translation import gettext as _
-
-from collection.models import Collection
-from config import celery_app
-from core.utils import date_utils
-from core.utils.utils import _get_user
-
-from journal.models import Journal
-
-from tracker.models import ArticleEvent
-from tracker.choices import ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, ARTICLE_EVENT_TYPE_DATA_ERROR
-
-from . import models, utils
-
-
-User = get_user_model()
-
-@celery_app.task(bind=True, name=_('Load article data from Article Meta'), timelimit=-1, queue='load')
-def task_load_article_from_article_meta(self, from_date=None, until_date=None, days_to_go_back=None, collection=None, issn=None, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading articles from Article Meta. From: {from_date}, Until: {until_date}, Collection: {collection}, ISSN: {issn}.')
-
- offset = 0
- limit = 1000
- while True:
- logging.info(f'{from_date}, {until_date}, {offset}, {limit}, {collection}, {issn}')
- response = utils.fetch_article_meta_dict(from_date, until_date, offset=offset, limit=limit, collection=collection, issn=issn)
- objects = response.get('objects')
- if not objects:
- break
-
- for obj in objects:
- codes = obj.get('code_title')
-
- for issn_code in codes:
- jou = Journal.objects.filter(
- Q(issns__electronic_issn=issn_code) |
- Q(issns__scielo_issn=issn_code) |
- Q(issns__print_issn=issn_code)
- ).first()
- if not jou:
- continue
-
- if not jou:
- logging.info(f'Journal not found for ISSNs: {codes}')
- continue
-
- col_obj = Collection.objects.get(acron3=obj.get('collection'))
- if not col_obj:
- logging.info(f'Collection not found: {obj.get("collection")}')
- continue
-
- try:
- article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=jou.scielo_issn, pid_v2=obj.get('code'))
- if created or force_update:
- article.files = obj.get('pdfs') or {}
- article.processing_date = obj.get('processing_date') or ''
- article.publication_date = obj.get('publication_date') or ''
- article.publication_year = obj.get('publication_year') or ''
- article.default_lang = obj.get('default_language') or ''
- article.text_langs = obj.get('text_langs') or ''
-
- article.save()
- logging.info(f'Article {"created" if created else "updated"}: {article}')
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error getting Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}',
- data=obj
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Article: {e}. Collection: {col_obj}, ISSN: {jou.scielo_issn}, PIDv2: {obj.get("code")}',
- data=obj
- )
- continue
-
- offset += limit
-
- return True
-
-
-@celery_app.task(bind=True, name=_('Load article data from OPAC'), timelimit=-1, queue='load')
-def task_load_article_from_opac(self, collection='scl', from_date=None, until_date=None, days_to_go_back=None, page=1, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading articles from OPAC. From: {from_date}, Until: {until_date}')
-
- while True:
- response = utils.fetch_opac_dict(from_date, until_date, page=page)
-
- documents = response.get('documents')
-
- for doc_id, doc in documents.items():
- col_obj = Collection.objects.get(acron3=collection)
- if not col_obj:
- logging.error(f'Collection not found: {collection}')
- continue
-
- journal = Journal.objects.get(collection=col_obj, acronym=doc.get('journal_acronym'))
- if not journal:
- logging.error(f'Journal not found: {doc.get("journal_acronym")}')
- continue
-
- try:
- article, created = models.Article.objects.get_or_create(collection=col_obj, scielo_issn=journal.scielo_issn, pid_v2=doc.get('pid_v2'))
-
- if created or force_update:
- article.pid_v3 = doc.get('pid_v3') or ''
- if not created:
- article.pid_v2 = doc.get('pid_v2') or ''
- article.publication_date = doc.get('publication_date') or article.publication_date or ''
- article.default_lang = doc.get('default_language') or article.default_lang or ''
-
- try:
- article.publication_year = article.publication_date[:4]
- except IndexError:
- article.publication_year = ''
-
- article.save()
- logging.info(f'Article {"created" if created else "updated"}: {article}')
-
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error getting Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error creating Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}',
- data=doc
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Article: {e}. Collection: {col_obj}, Journal: {journal.scielo_issn}, PIDv2: {doc.get("pid_v2")}',
- data=doc
- )
- continue
-
- page += 1
- if page > int(response.get('pages', 0)):
- break
-
- return True
-
-
-@celery_app.task(bind=True, name=_('Load preprint data from SciELO Preprints'), timelimit=-1, queue='load')
-def task_load_preprints_from_preprints_api(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading preprints from SciELO Preprints. From: {from_date}, Until: {until_date}')
-
- col_obj = Collection.objects.get(acron3='preprints')
- if not col_obj:
- logging.error(f'Collection not found: preprints')
- return False
-
- for record in utils.fetch_preprint_oai_pmh(from_date, until_date):
- data = utils.extract_preprint_data(record)
-
- if not data.get('pid_generic'):
- logging.error(f'Preprint ID not found in record: {record}')
- continue
-
- try:
- article, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=data['pid_generic'])
- if created or force_update:
- article.text_langs = data.get('text_langs')
- article.default_lang = data.get('default_language')
- article.publication_date = data.get('publication_date')
- article.publication_year = data.get('publication_year')
-
- # Preprints do not have a scielo_issn yet
- article.scielo_issn = '0000-0000'
-
- article.save()
- logging.debug(f'Article {"created" if created else "updated"}: {article}')
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error creating Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}',
- data=data
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Article: {e}. Collection: {col_obj}, PID: {data["pid_generic"]}',
- data=data
- )
- continue
-
-
-@celery_app.task(bind=True, name=_('Load dataset metadata from Dataverse'), timelimit=-1, queue='load')
-def task_load_dataset_metadata_from_dataverse(self, from_date=None, until_date=None, days_to_go_back=None, force_update=True, user_id=None, username=None):
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
- logging.info(f'Loading dataset metadata from SciELO Data. From: {from_date}, Until: {until_date}')
-
- col_obj = Collection.objects.get(acron3='data')
- if not col_obj:
- logging.error(f'Collection not found: data')
- return False
-
- for record in utils.fetch_dataverse_metadata(from_date, until_date):
- dataset_doi = record.get('dataset_doi')
- if not dataset_doi:
- logging.error(f'Dataset DOI not found in record: {record}')
- continue
-
- try:
- dataset, created = models.Article.objects.get_or_create(collection=col_obj, pid_generic=dataset_doi)
- if created or force_update:
- dataset.publication_date = record.get('dataset_published')
-
- file_persistent_id = record.get('file_persistent_id')
- file_id = record.get('file_id')
- file_name = record.get('file_name')
- file_url = record.get('file_url')
-
- if file_id:
- dataset.files[file_id] = {'name': file_name, 'url': file_url, 'file_persisent_id': file_persistent_id}
-
- dataset.save()
- logging.debug(f'Dataset {"created" if created else "updated"}: {dataset}')
- except models.Article.MultipleObjectsReturned as e:
- logging.error(f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED,
- message=f'Error creating Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}',
- data=record
- )
- continue
- except DataError as e:
- logging.error(f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}')
- ArticleEvent.create(
- event_type=ARTICLE_EVENT_TYPE_DATA_ERROR,
- message=f'Error saving Dataset: {e}. Collection: {col_obj}, PID: {dataset_doi}',
- data=record
- )
- continue
-
- return True
diff --git a/article/tests.py b/article/tests.py
deleted file mode 100644
index 7ce503c..0000000
--- a/article/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/article/utils.py b/article/utils.py
deleted file mode 100644
index b9a094e..0000000
--- a/article/utils.py
+++ /dev/null
@@ -1,204 +0,0 @@
-import logging
-import requests
-import os
-
-from sickle import Sickle
-from time import sleep
-
-from core.utils import standardizer
-
-
-ARTICLEMETA_ENDPOINT = os.environ.get('ARTICLEMETA_COLLECT_URL', 'http://articlemeta.scielo.org/api/v1/article/counter_dict')
-ARTICLEMETA_MAX_RETRIES = int(os.environ.get('ARTICLEMETA_MAX_RETRIES', 5))
-ARTICLEMETA_SLEEP_TIME = int(os.environ.get('ARTICLEMETA_SLEEP_TIME', 30))
-
-OPAC_ENDPOINT = os.environ.get('OPAC_ENDPOINT', 'https://www.scielo.br/api/v1/counter_dict')
-OPAC_MAX_RETRIES = int(os.environ.get('OPAC_MAX_RETRIES', 5))
-OPAC_SLEEP_TIME = int(os.environ.get('OPAC_SLEEP_TIME', 30))
-
-OAI_PMH_PREPRINT_ENDPOINT = os.environ.get('OAI_PMH_PREPRINT_ENDPOINT', 'https://preprints.scielo.org/index.php/scielo/oai')
-OAI_METADATA_PREFIX = os.environ.get('OAI_METADATA_PREFIX', 'oai_dc')
-OAI_PMH_MAX_RETRIES = int(os.environ.get('OAI_PMH_MAX_RETRIES', 5))
-
-DATAVERSE_ENDPOINT = os.environ.get('DATAVERSE_ENDPOINT', 'https://data.scielo.org/api')
-DATAVERSE_ROOT_COLLECTION = os.environ.get('DATAVERSE_ROOT_COLLECTION', 'scielodata')
-DATAVERSE_MAX_RETRIES = int(os.environ.get('DATAVERSE_MAX_RETRIES', 5))
-DATAVERSE_SLEEP_TIME = int(os.environ.get('DATAVERSE_SLEEP_TIME', 30))
-
-
-def fetch_article_meta_dict(from_date, until_date, offset=0, limit=1000, collection=None, issn=None):
- for t in range(1, ARTICLEMETA_MAX_RETRIES + 1):
- params = {
- 'from': from_date,
- 'until': until_date,
- 'offset': offset,
- 'limit': limit
- }
-
- if collection:
- params['collection'] = collection
-
- if issn:
- params['issn'] = issn
-
- response = requests.get(ARTICLEMETA_ENDPOINT, params=params)
-
- try:
- response.raise_for_status()
- logging.info(response.url)
-
- except requests.exceptions.HTTPError:
- logging.warning(
- 'Failed to collect data from %s. Waiting %d seconds before retry %d of %d' % (
- response.url,
- ARTICLEMETA_SLEEP_TIME,
- t,
- ARTICLEMETA_MAX_RETRIES
- )
- )
- sleep(ARTICLEMETA_SLEEP_TIME)
-
- else:
- return response.json()
-
-
-def fetch_opac_dict(from_date, until_date, page=1):
- for t in range(1, OPAC_MAX_RETRIES + 1):
- params = {
- 'begin_date': from_date,
- 'end_date': until_date,
- 'page': page
- }
-
- response = requests.get(url=OPAC_ENDPOINT, params=params, verify=False)
-
- try:
- response.raise_for_status()
- logging.info(response.url)
-
- except requests.exceptions.HTTPError:
- logging.warning('Não foi possível coletar dados de %s. Aguardando %d segundos para tentativa %d de %d' % (response.url, OPAC_SLEEP_TIME, t, OPAC_MAX_RETRIES))
- sleep(OPAC_SLEEP_TIME)
-
- else:
- return response.json()
-
-
-def fetch_preprint_oai_pmh(from_date, until_date):
- oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT, max_retries=OAI_PMH_MAX_RETRIES, verify=False)
- records = oai_client.ListRecords(**{
- 'metadataPrefix': OAI_METADATA_PREFIX,
- 'from': from_date,
- 'until': until_date,
- })
-
- for r in records:
- yield r
-
-
-def extract_preprint_data(record):
- pid_generic = _extract_preprint_compatible_identifer(record.header.identifier)
- text_langs = [standardizer.standardize_language_code(l) for l in record.metadata.get('language', [])]
- publication_date = record.metadata.get('date', [''])[0]
- default_language = text_langs[0] if text_langs else ''
- publication_year = _extract_preprint_publication_year_from_date(publication_date)
-
- data = {
- 'pid_generic': pid_generic,
- 'text_langs': text_langs,
- 'publication_date': publication_date,
- 'default_language': default_language,
- 'publication_year': publication_year
- }
-
- return data
-
-
-def _extract_preprint_compatible_identifer(pid_v2):
- try:
- # piv_v2 should be something like oai:ops.preprints.scielo.org:preprint/1195
- # we are using the last part of the string as the identifier
- return pid_v2.split(':')[-1].split('/')[1]
- except IndexError:
- return ''
-
-
-def _extract_preprint_publication_year_from_date(date_str):
- try:
- return date_str[:4]
- except IndexError:
- return ''
-
-
-def fetch_dataverse_metadata(from_date=None, until_date=None):
- def get_subdataverses():
- url = f"{DATAVERSE_ENDPOINT}/dataverses/{DATAVERSE_ROOT_COLLECTION}/contents"
- try:
- response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
- response.raise_for_status()
- return response.json().get("data", [])
- except requests.exceptions.RequestException as e:
- logging.error(f"Error fetching subdataverses: {e}")
- return []
-
- def get_datasets(subdataverse_id):
- url = f"{DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents"
- try:
- response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
- response.raise_for_status()
- return response.json().get("data", [])
- except requests.exceptions.RequestException as e:
- logging.error(f"Error fetching datasets for subdataverse {subdataverse_id}: {e}")
- return []
-
- def get_files(dataset_id):
- url = f"{DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files"
- try:
- response = requests.get(url, timeout=DATAVERSE_SLEEP_TIME)
- response.raise_for_status()
- return response.json().get("data", [])
- except requests.exceptions.RequestException as e:
- logging.error(f"Error fetching files for dataset {dataset_id}: {e}")
- return []
-
- subdataverses = get_subdataverses()
-
- for subdataverse in subdataverses:
- if subdataverse["type"] != "dataverse":
- continue
-
- subdataverse_id = subdataverse["id"]
- subdataverse_title = subdataverse["title"]
- datasets = get_datasets(subdataverse_id)
-
- for dataset in datasets:
- if dataset["type"] != "dataset":
- continue
-
- dataset_id = dataset["id"]
- doi = standardizer.standardize_doi(dataset.get("persistentUrl"))
- if not doi:
- logging.warning(f"Dataset {dataset_id} does not have a DOI.")
- continue
-
- publication_date = dataset.get("publicationDate", None)
-
- if publication_date:
- if (from_date and publication_date < from_date) or (until_date and publication_date > until_date):
- continue
-
- files = get_files(dataset_id)
-
- for file in files:
- file_persistent_id = file["dataFile"].get("persistentId", None)
- file_persistent_id_stz = standardizer.standardize_pid_generic(file_persistent_id) if file_persistent_id else None
-
- yield {
- "title": subdataverse_title,
- "dataset_doi": doi,
- "dataset_published": publication_date,
- "file_id": file["dataFile"]["id"],
- "file_name": file["label"],
- "file_url": f"{DATAVERSE_ENDPOINT}/access/datafile/{file['dataFile']['id']}",
- "file_persistent_id": file_persistent_id_stz,
- }
diff --git a/article/views.py b/article/views.py
deleted file mode 100644
index 91ea44a..0000000
--- a/article/views.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.
diff --git a/collection/models.py b/collection/models.py
index f2d7ce7..87da123 100644
--- a/collection/models.py
+++ b/collection/models.py
@@ -9,7 +9,7 @@
from core.forms import CoreAdminModelForm
from core.models import CommonControlField, Language, TextWithLang
-from core.utils.utils import fetch_data
+from core.utils.request_utils import fetch_data
from . import choices
diff --git a/collection/tasks.py b/collection/tasks.py
index 02fd0e7..19372de 100644
--- a/collection/tasks.py
+++ b/collection/tasks.py
@@ -1,14 +1,14 @@
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _
-from core.utils.utils import _get_user
+from core.utils.request_utils import _get_user
from collection.models import Collection
from config import celery_app
User = get_user_model()
-@celery_app.task(bind=True, name=_('Load collection data'))
+@celery_app.task(bind=True, name=_('[Collection] Load Collection Data'))
def task_load_collections(self, user_id=None, username=None):
user = _get_user(self.request, username=username, user_id=user_id)
Collection.load(user)
diff --git a/collection/wagtail_hooks.py b/collection/wagtail_hooks.py
index e7b7e97..52b31a8 100644
--- a/collection/wagtail_hooks.py
+++ b/collection/wagtail_hooks.py
@@ -1,8 +1,5 @@
from django.utils.translation import gettext as _
from wagtail.snippets.views.snippets import SnippetViewSet
-from wagtail.snippets.models import register_snippet
-
-from config.menu import get_menu_order
from .models import Collection
@@ -10,10 +7,8 @@
class CollectionSnippetViewSet(SnippetViewSet):
model = Collection
icon = "folder-open-inverse"
- menu_name = 'collection'
menu_label = _("Collection")
- menu_order = get_menu_order("collection")
- add_to_admin_menu = True
+ menu_order = 100
list_display = (
"main_name",
@@ -57,6 +52,3 @@ class CollectionSnippetViewSet(SnippetViewSet):
"updated_by",
)
export_filename = "collections"
-
-
-register_snippet(CollectionSnippetViewSet)
diff --git a/compose/local/django/Dockerfile b/compose/local/django/Dockerfile
index 4351d9e..aac7972 100644
--- a/compose/local/django/Dockerfile
+++ b/compose/local/django/Dockerfile
@@ -23,8 +23,7 @@ COPY ./requirements .
RUN python -m pip install --upgrade pip
# Create Python Dependency and Sub-Dependency Wheels.
-RUN pip wheel --wheel-dir /usr/src/app/wheels \
- -r ${BUILD_ENVIRONMENT}.txt
+RUN pip wheel --wheel-dir /usr/src/app/wheels -r ${BUILD_ENVIRONMENT}.txt
# Python 'run' stage
diff --git a/compose/local/django/celery/worker/start b/compose/local/django/celery/worker/start
index 7db6f27..f0c7efc 100644
--- a/compose/local/django/celery/worker/start
+++ b/compose/local/django/celery/worker/start
@@ -21,4 +21,14 @@ watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concu
# Worker para arg bol cub data ecu per preprints pry rve spa sss sza ury wid
watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small -n worker.parse_small@%h &
-wait
\ No newline at end of file
+# Additional serial workers for parallel backfill of small collections
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_1 -n worker.parse_small_1@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_2 -n worker.parse_small_2@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_3 -n worker.parse_small_3@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_4 -n worker.parse_small_4@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_5 -n worker.parse_small_5@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_6 -n worker.parse_small_6@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_7 -n worker.parse_small_7@%h &
+watchgod celery.__main__.main --args -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_8 -n worker.parse_small_8@%h &
+
+wait
diff --git a/compose/production/django/celery/worker/start b/compose/production/django/celery/worker/start
index 4fb112e..6269dd5 100644
--- a/compose/production/django/celery/worker/start
+++ b/compose/production/django/celery/worker/start
@@ -22,4 +22,14 @@ celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_medium -n wo
# Worker para arg bol cub data ecu per preprints pry rve spa sss sza ury wid (coleções pequenas)
celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small -n worker.parse_small@%h &
-wait
\ No newline at end of file
+# Additional serial workers for parallel backfill of small collections
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_1 -n worker.parse_small_1@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_2 -n worker.parse_small_2@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_3 -n worker.parse_small_3@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_4 -n worker.parse_small_4@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_5 -n worker.parse_small_5@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_6 -n worker.parse_small_6@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_7 -n worker.parse_small_7@%h &
+celery -A config.celery_app worker -l INFO --concurrency=1 -Q parse_small_8 -n worker.parse_small_8@%h &
+
+wait
diff --git a/compose/production/django/entrypoint b/compose/production/django/entrypoint
index 599841e..02470cd 100644
--- a/compose/production/django/entrypoint
+++ b/compose/production/django/entrypoint
@@ -16,6 +16,20 @@ if [ -z "${POSTGRES_USER}" ]; then
fi
export DATABASE_URL="postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}"
+if [ "${USE_LOCAL_SCIELO_LIBS:-0}" = "1" ]; then
+ for path in /app/scielo_log_validator /app/scielo_usage_counter; do
+ if [ ! -f "${path}/setup.py" ] && [ ! -f "${path}/pyproject.toml" ]; then
+ >&2 echo "Local lib path not ready: ${path}"
+ exit 1
+ fi
+ done
+
+ >&2 echo "Installing local SciELO libs from mounted repositories..."
+ pip install --root-user-action=ignore --no-cache-dir --no-build-isolation --no-deps \
+ -e /app/scielo_log_validator \
+ -e /app/scielo_usage_counter
+fi
+
postgres_ready() {
python << END
import sys
diff --git a/config/collections.py b/config/collections.py
new file mode 100644
index 0000000..9aa3efe
--- /dev/null
+++ b/config/collections.py
@@ -0,0 +1,63 @@
+COLLECTION_ACRON3_SIZE_MAP = {
+ "scl": "xlarge",
+ "chl": "large",
+ "col": "large",
+ "mex": "large",
+ "cri": "medium",
+ "esp": "medium",
+ "psi": "medium",
+ "prt": "medium",
+ "ven": "medium",
+ "arg": "small",
+ "bol": "small",
+ "books": "small",
+ "cub": "small",
+ "data": "small",
+ "dom": "small",
+ "ecu": "small",
+ "per": "small",
+ "preprints": "small",
+ "pry": "small",
+ "rve": "small",
+ "spa": "small",
+ "sss": "small",
+ "sza": "small",
+ "ury": "small",
+ "wid": "small",
+}
+
+COLLECTION_SIZE_SAMPLE_MAP = {
+ "small": 1.0,
+ "medium": 0.5,
+ "large": 0.1,
+ "xlarge": 0.1,
+}
+
+LOG_MANAGER_SEED_DATA = [
+ {"acronym": "arg", "directory_name": "Site clássico", "path": "/app/logs/scielo.ar", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "bol", "directory_name": "Site clássico", "path": "/app/logs/scielo.bo", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "chl", "directory_name": "Site clássico", "path": "/app/logs/scielo.cl", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "col", "directory_name": "Site clássico", "path": "/app/logs/scielo.co", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "cri", "directory_name": "Site clássico", "path": "/app/logs/scielo.cr", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "cub", "directory_name": "Site clássico", "path": "/app/logs/scielo.cu", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "data", "directory_name": "Site clássico", "path": "/app/logs/dataverse", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "dataverse"},
+ {"acronym": "dom", "directory_name": "Site novo", "path": "/app/logs/scielo.dom", "quantity": 1, "start_date": "2026-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac"},
+ {"acronym": "ecu", "directory_name": "Site clássico", "path": "/app/logs/scielo.ec", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "esp", "directory_name": "Site clássico", "path": "/app/logs/scielo.es", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "mex", "directory_name": "Site clássico", "path": "/app/logs/scielo.mx", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "per", "directory_name": "Site clássico", "path": "/app/logs/scielo.pe", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "preprints", "directory_name": "Site clássico", "path": "/app/logs/submission-node01", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "preprints"},
+ {"acronym": "prt", "directory_name": "Site clássico", "path": "/app/logs/scielo.pt", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "pry", "directory_name": "Site clássico", "path": "/app/logs/scielo.py", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "psi", "directory_name": "Site clássico", "path": "/app/logs/scielo.pepsic", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "rve", "directory_name": "Site clássico", "path": "/app/logs/scielo.revenf", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "rvt", "directory_name": "Site clássico", "path": "/app/logs/scielo.revtur", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "scl", "directory_name": "Site novo", "path": "/app/logs/scielo.br", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac"},
+ {"acronym": "spa", "directory_name": "Site novo - versão prévia", "path": "/app/logs/scielo.sp", "quantity": 2, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "opac_alpha"},
+ {"acronym": "sss", "directory_name": "Site clássico", "path": "/app/logs/scielo.ss", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "sza", "directory_name": "Site clássico", "path": "/app/logs/scielo.za", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "ury", "directory_name": "Site clássico", "path": "/app/logs/scielo.uy", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "ven", "directory_name": "Site clássico", "path": "/app/logs/scielo.ve", "quantity": 1, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "wid", "directory_name": "Site clássico", "path": "/app/logs/scielo.wi", "quantity": 2, "start_date": "2020-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "classic"},
+ {"acronym": "books", "directory_name": "SciELO Books", "path": "/app/logs/books", "quantity": 1, "start_date": "2012-01-01", "e-mail": "tecnologia@scielo.org", "translator_class": "books"},
+]
diff --git a/config/menu.py b/config/menu.py
index 13371c6..844ce0c 100644
--- a/config/menu.py
+++ b/config/menu.py
@@ -1,13 +1,10 @@
WAGTAIL_MENU_APPS_ORDER = {
- "collection": 100,
- "article": 200,
- "journal": 300,
- "resources": 400,
- "log_manager": 500,
- "log_manager_config": 600,
- "metrics": 700,
- "tasks": 800,
- "unexpected-error": 900,
+ "metadata": 100,
+ "resources": 200,
+ "log_manager": 300,
+ "tracker": 400,
+ "metrics": 500,
+ "tasks": 600,
}
def get_menu_order(app_name):
diff --git a/config/settings/base.py b/config/settings/base.py
index 4e96ed4..e4a99fa 100644
--- a/config/settings/base.py
+++ b/config/settings/base.py
@@ -5,7 +5,8 @@
from pathlib import Path
import environ
-from django.utils.translation import gettext_lazy as _
+
+from config.collections import COLLECTION_ACRON3_SIZE_MAP # noqa: F401
ROOT_DIR = Path(__file__).resolve(strict=True).parent.parent.parent
# core/
@@ -114,14 +115,15 @@
"core.users",
"core_settings",
# Your stuff: custom apps go here
- "article",
"collection",
"core",
- "journal",
+ "document",
"log_manager",
"log_manager_config",
"metrics",
+ "reports",
"resources",
+ "source",
"tracker",
]
@@ -404,36 +406,54 @@
SEARCH_PAGINATION_ITEMS_PER_PAGE = 10
-# Elasticsearch
+# OpenSearch
# ------------------------------------------------------------------------------
-ES_URL = env("ES_URL", default="http://192.168.0.33:9200/")
-ES_INDEX_NAME = env("ES_INDEX_NAME", default="usage")
-ES_API_KEY = env("ES_API_KEY", default="")
-ES_BASIC_AUTH = env("ES_BASIC_AUTH", default=("elastic", "iHktg66E"))
-ES_VERIFY_CERTS = env.bool("ES_VERIFY_CERTS", default=False)
+OPENSEARCH_URL = env("OPENSEARCH_URL", default="http://localhost:9200/")
+OPENSEARCH_INDEX_NAME = env("OPENSEARCH_INDEX_NAME", default="usage")
+OPENSEARCH_API_KEY = env("OPENSEARCH_API_KEY", default="")
+OPENSEARCH_BASIC_AUTH = env(
+ "OPENSEARCH_BASIC_AUTH",
+ default=("admin", "admin"),
+)
+OPENSEARCH_VERIFY_CERTS = env.bool(
+ "OPENSEARCH_VERIFY_CERTS",
+ default=False,
+)
+
+# Collectors configuration
+# ------------------------------------------------------------------------------
+# ArticleMeta
+ARTICLEMETA_COLLECT_URL = env(
+ "ARTICLEMETA_COLLECT_URL",
+ default="http://articlemeta.scielo.org/api/v1/article/counter_dict",
+)
+ARTICLEMETA_MAX_RETRIES = env.int("ARTICLEMETA_MAX_RETRIES", default=5)
+ARTICLEMETA_SLEEP_TIME = env.int("ARTICLEMETA_SLEEP_TIME", default=30)
+
+# Dataverse
+DATAVERSE_ENDPOINT = env("DATAVERSE_ENDPOINT", default="https://data.scielo.org/api")
+DATAVERSE_ROOT_COLLECTION = env("DATAVERSE_ROOT_COLLECTION", default="scielodata")
+DATAVERSE_SLEEP_TIME = env.int("DATAVERSE_SLEEP_TIME", default=30)
+
+# OPAC
+OPAC_ENDPOINT = env("OPAC_ENDPOINT", default="https://www.scielo.br/api/v1/counter_dict")
+OPAC_MAX_RETRIES = env.int("OPAC_MAX_RETRIES", default=5)
+OPAC_SLEEP_TIME = env.int("OPAC_SLEEP_TIME", default=30)
+
+# Preprints
+OAI_PMH_PREPRINT_ENDPOINT = env(
+ "OAI_PMH_PREPRINT_ENDPOINT",
+ default="https://preprints.scielo.org/index.php/scielo/oai",
+)
+OAI_METADATA_PREFIX = env("OAI_METADATA_PREFIX", default="oai_dc")
+OAI_PMH_MAX_RETRIES = env.int("OAI_PMH_MAX_RETRIES", default=5)
+
+# SciELO Books
+SCIELO_BOOKS_BASE_URL = env("SCIELO_BOOKS_BASE_URL", default="http://localhost:5984")
+SCIELO_BOOKS_TIMEOUT = env.int("SCIELO_BOOKS_TIMEOUT", default=60)
+SCIELO_BOOKS_DB_NAME = env("SCIELO_BOOKS_DB_NAME", default="scielobooks_1a")
+SCIELO_BOOKS_LIMIT = env.int("SCIELO_BOOKS_LIMIT", default=1000)
# Collection size categories
# ------------------------------------------------------------------------------
-EXTRA_LARGE_COLLECTIONS = env.list("EXTRA_LARGE_COLLECTIONS", default=["scl"])
-LARGE_COLLECTIONS = env.list("LARGE_COLLECTIONS", default=["chl", "col", "mex"])
-MEDIUM_COLLECTIONS = env.list("MEDIUM_COLLECTIONS", default=["cri", "esp", "psi", "prt", "ven"])
-SMALL_COLLECTIONS = env.list("SMALL_COLLECTIONS", default=["arg", "bol", "cub", "data", "ecu", "per", "preprints", "pry", "rve", "spa", "sss", "sza", "ury", "wid"])
-
-# Collection size mapping
-def _build_collection_size_map():
- """Build mapping of collection acronyms to their size categories."""
- size_map = {}
- size_categories = {
- "xlarge": EXTRA_LARGE_COLLECTIONS,
- "large": LARGE_COLLECTIONS,
- "medium": MEDIUM_COLLECTIONS,
- "small": SMALL_COLLECTIONS,
- }
-
- for size, collections in size_categories.items():
- for acron3 in collections:
- size_map[acron3] = size
-
- return size_map
-
-COLLECTION_ACRON3_SIZE_MAP = _build_collection_size_map()
+SUPPORTED_LOGFILE_EXTENSIONS = env.list("SUPPORTED_LOGFILE_EXTENSIONS", default=[".log", ".gz", ".zip"])
diff --git a/core/collectors/__init__.py b/core/collectors/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/core/collectors/__init__.py
@@ -0,0 +1 @@
+
diff --git a/core/collectors/articlemeta.py b/core/collectors/articlemeta.py
new file mode 100644
index 0000000..7f6ace0
--- /dev/null
+++ b/core/collectors/articlemeta.py
@@ -0,0 +1,60 @@
+import logging
+
+import requests
+from django.conf import settings
+from articlemeta.client import RestfulClient, ThriftClient
+from time import sleep
+
+
+def fetch_article_counter_dict(
+ from_date,
+ until_date,
+ offset=0,
+ limit=1000,
+ collection=None,
+ issn=None,
+):
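+    # Query the ArticleMeta counter_dict endpoint, retrying up to
+    # ARTICLEMETA_MAX_RETRIES times and sleeping between failed attempts.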
+ for attempt in range(1, settings.ARTICLEMETA_MAX_RETRIES + 1):
+ params = {
+ "from": from_date,
+ "until": until_date,
+ "offset": offset,
+ "limit": limit,
+ }
+
+ if collection:
+ params["collection"] = collection
+
+ if issn:
+ params["issn"] = issn
+
+ response = requests.get(settings.ARTICLEMETA_COLLECT_URL, params=params)
+
+ try:
+ response.raise_for_status()
+ logging.info(response.url)
+ except requests.exceptions.HTTPError:
+ logging.warning(
+ "Failed to collect data from %s. Waiting %d seconds before retry %d of %d",
+ response.url,
+ settings.ARTICLEMETA_SLEEP_TIME,
+ attempt,
+ settings.ARTICLEMETA_MAX_RETRIES,
+ )
+ sleep(settings.ARTICLEMETA_SLEEP_TIME)
+ else:
+ return response.json()
+
+ return {}
+
+
+def iter_journals(collection="scl", mode="rest"):
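+    # Yield journal records from ArticleMeta using the RESTful or Thrift client.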
+ if mode == "rest":
+ client = RestfulClient()
+ elif mode == "thrift":
+ client = ThriftClient()
+ else:
+ raise ValueError(f"Unsupported ArticleMeta mode: {mode}")
+
+ for journal in client.journals(collection=collection):
+ yield journal
diff --git a/core/collectors/dataverse.py b/core/collectors/dataverse.py
new file mode 100644
index 0000000..ca51fd7
--- /dev/null
+++ b/core/collectors/dataverse.py
@@ -0,0 +1,75 @@
+import logging
+
+import requests
+from django.conf import settings
+
+from core.utils import standardizer
+
+
+def _request_json(url):
+ try:
+ response = requests.get(url, timeout=settings.DATAVERSE_SLEEP_TIME)
+ response.raise_for_status()
+ return response.json()
+ except requests.exceptions.RequestException as exc:
+ logging.error("Error fetching %s: %s", url, exc)
+ return {}
+
+
+def _get_subdataverses():
+ url = f"{settings.DATAVERSE_ENDPOINT}/dataverses/{settings.DATAVERSE_ROOT_COLLECTION}/contents"
+ return _request_json(url).get("data", [])
+
+
+def _get_datasets(subdataverse_id):
+ url = f"{settings.DATAVERSE_ENDPOINT}/dataverses/{subdataverse_id}/contents"
+ return _request_json(url).get("data", [])
+
+
+def _get_files(dataset_id):
+ url = f"{settings.DATAVERSE_ENDPOINT}/datasets/{dataset_id}/versions/:latest/files"
+ return _request_json(url).get("data", [])
+
+
+def iter_dataset_metadata(from_date=None, until_date=None):
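+    # Walk root collection -> subdataverses -> datasets -> files, yielding one
+    # metadata dict per file; datasets without a DOI or outside the date range are skipped.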
+ for subdataverse in _get_subdataverses():
+ if subdataverse.get("type") != "dataverse":
+ continue
+
+ subdataverse_id = subdataverse["id"]
+ subdataverse_title = subdataverse["title"]
+
+ for dataset in _get_datasets(subdataverse_id):
+ if dataset.get("type") != "dataset":
+ continue
+
+ dataset_id = dataset["id"]
+ doi = standardizer.standardize_doi(dataset.get("persistentUrl"))
+ if not doi:
+ logging.warning("Dataset %s does not have a DOI.", dataset_id)
+ continue
+
+ publication_date = dataset.get("publicationDate")
+ if publication_date:
+ if (from_date and publication_date < from_date) or (
+ until_date and publication_date > until_date
+ ):
+ continue
+
+ for file_data in _get_files(dataset_id):
+ file_persistent_id = file_data["dataFile"].get("persistentId")
+ standardized_persistent_id = (
+ standardizer.standardize_pid_generic(file_persistent_id)
+ if file_persistent_id
+ else None
+ )
+
+ yield {
+ "title": subdataverse_title,
+ "dataset_doi": doi,
+ "dataset_published": publication_date,
+ "file_id": file_data["dataFile"]["id"],
+ "file_name": file_data["label"],
+ "file_url": f"{settings.DATAVERSE_ENDPOINT}/access/datafile/{file_data['dataFile']['id']}",
+ "file_persistent_id": standardized_persistent_id,
+ }
diff --git a/core/collectors/opac.py b/core/collectors/opac.py
new file mode 100644
index 0000000..94122b7
--- /dev/null
+++ b/core/collectors/opac.py
@@ -0,0 +1,33 @@
+import logging
+
+import requests
+from django.conf import settings
+from time import sleep
+
+
+def fetch_counter_dict(from_date, until_date, page=1):
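+    # Fetch one page of the OPAC counter_dict API, retrying up to OPAC_MAX_RETRIES
+    # times; returns an empty dict if every attempt fails.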
+ for attempt in range(1, settings.OPAC_MAX_RETRIES + 1):
+ params = {
+ "begin_date": from_date,
+ "end_date": until_date,
+ "page": page,
+ }
+
+ response = requests.get(url=settings.OPAC_ENDPOINT, params=params, verify=False)
+
+ try:
+ response.raise_for_status()
+ logging.info(response.url)
+ except requests.exceptions.HTTPError:
+ logging.warning(
+ "Could not collect data from %s. Waiting %d seconds for attempt %d of %d",
+ response.url,
+ settings.OPAC_SLEEP_TIME,
+ attempt,
+ settings.OPAC_MAX_RETRIES,
+ )
+ sleep(settings.OPAC_SLEEP_TIME)
+ else:
+ return response.json()
+
+ return {}
diff --git a/core/collectors/preprints.py b/core/collectors/preprints.py
new file mode 100644
index 0000000..bead72c
--- /dev/null
+++ b/core/collectors/preprints.py
@@ -0,0 +1,55 @@
+from django.conf import settings
+from sickle import Sickle
+
+from core.utils import standardizer
+
+
+def iter_records(from_date, until_date):
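+    # Harvest OAI-PMH records from the SciELO Preprints endpoint for the given date range.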
+ oai_client = Sickle(
+ endpoint=settings.OAI_PMH_PREPRINT_ENDPOINT,
+ max_retries=settings.OAI_PMH_MAX_RETRIES,
+ verify=False,
+ )
+ records = oai_client.ListRecords(
+ **{
+ "metadataPrefix": settings.OAI_METADATA_PREFIX,
+ "from": from_date,
+ "until": until_date,
+ }
+ )
+
+ for record in records:
+ yield record
+
+
+def extract_record_data(record):
+ pid_generic = _extract_compatible_identifier(record.header.identifier)
+ text_langs = [
+ standardizer.standardize_language_code(language)
+ for language in record.metadata.get("language", [])
+ ]
+ publication_date = record.metadata.get("date", [""])[0]
+ default_language = text_langs[0] if text_langs else ""
+ publication_year = _extract_publication_year_from_date(publication_date)
+
+ return {
+ "pid_generic": pid_generic,
+ "text_langs": text_langs,
+ "publication_date": publication_date,
+ "default_language": default_language,
+ "publication_year": publication_year,
+ }
+
+
+def _extract_compatible_identifier(identifier):
+ try:
+ return identifier.split(":")[-1].split("/")[1]
+ except IndexError:
+ return ""
+
+
+def _extract_publication_year_from_date(date_str):
+ try:
+ return date_str[:4]
+    except (TypeError, IndexError):
+ return ""
diff --git a/core/collectors/scielo_books.py b/core/collectors/scielo_books.py
new file mode 100644
index 0000000..b1f2dd8
--- /dev/null
+++ b/core/collectors/scielo_books.py
@@ -0,0 +1,182 @@
+import logging
+
+import requests
+from django.conf import settings
+from urllib.parse import urlencode
+
+
+
+
+def build_url(base_url, params=None):
+ if not params:
+ return base_url
+ return f"{base_url}?{urlencode(params, doseq=True)}"
+
+
+def sanitize_raw_data(payload):
+ if not isinstance(payload, dict):
+ return payload
+
+ if "_id" not in payload:
+ return payload
+
+ sanitized = dict(payload)
+ sanitized["id"] = sanitized.pop("_id")
+ return sanitized
+
+
+def fetch_document(doc_id, base_url=None, db_name=None, headers=None):
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ resolved_base_url = base_url or settings.SCIELO_BOOKS_BASE_URL
+ if not resolved_base_url:
+ logging.error("Sem base url definida para coleta de books")
+ raise ValueError("SCIELO_BOOKS_BASE_URL is not configured")
+
+ url = f"{resolved_base_url}/{db_name}/{doc_id}"
+ response = requests.get(url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False)
+ response.raise_for_status()
+ payload = response.json()
+ return sanitize_raw_data(payload), url
+
+
+def fetch_changes_page(
+ base_url=None,
+ db_name=None,
+ since=0,
+ limit=None,
+ include_docs=False,
+ headers=None,
+):
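+    # Request a single page of the CouchDB-style _changes feed for the books database.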
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ resolved_base_url = base_url or settings.SCIELO_BOOKS_BASE_URL
+ if not resolved_base_url:
+ logging.error("Sem base url definida para coleta de books")
+ raise ValueError("SCIELO_BOOKS_BASE_URL is not configured")
+
+ params = {
+ "since": since,
+ "limit": limit,
+ }
+ if include_docs:
+ params["include_docs"] = "true"
+
+ url = build_url(f"{resolved_base_url}/{db_name}/_changes", params)
+ response = requests.get(url, headers=headers, timeout=settings.SCIELO_BOOKS_TIMEOUT, verify=False)
+ response.raise_for_status()
+ payload = response.json()
+ return payload if isinstance(payload, dict) else {}
+
+
+def extract_changes(payload):
+ if isinstance(payload, dict) and isinstance(payload.get("results"), list):
+ return payload.get("results")
+ return []
+
+
+def extract_last_seq(payload):
+ if isinstance(payload, dict):
+ return payload.get("last_seq") or payload.get("seq")
+ return None
+
+
+def iter_changes(
+ base_url=None,
+ db_name=None,
+ since=0,
+ limit=None,
+ headers=None,
+):
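+    # Follow the _changes feed from `since`, yielding raw change entries until the
+    # feed stops advancing or returns an empty page.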
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ current_since = since or 0
+
+ while True:
+ payload = fetch_changes_page(
+ base_url=base_url,
+ db_name=db_name,
+ since=current_since,
+ limit=limit,
+ include_docs=False,
+ headers=headers,
+ )
+ changes = extract_changes(payload)
+ if not changes:
+ break
+
+ for change in changes:
+ yield change
+
+ last_seq = extract_last_seq(payload)
+ if last_seq is None or last_seq == current_since:
+ break
+ current_since = last_seq
+
+
+def iter_change_documents(
+ base_url=None,
+ db_name=None,
+ since=0,
+ limit=None,
+ headers=None,
+):
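+    # Page through _changes with include_docs=True, yielding one dict per change;
+    # documents missing from the payload are fetched individually via fetch_document.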
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ current_since = since or 0
+
+ while True:
+ payload = fetch_changes_page(
+ base_url=base_url,
+ db_name=db_name,
+ since=current_since,
+ limit=limit,
+ include_docs=True,
+ headers=headers,
+ )
+ changes = extract_changes(payload)
+ if not changes:
+ break
+
+ for change in changes:
+ doc_id = change.get("id")
+ if not doc_id:
+ continue
+
+ deleted = bool(change.get("deleted"))
+ raw_doc = change.get("doc") or {}
+ if deleted:
+ yield {
+ "change": change,
+ "deleted": True,
+ "payload": None,
+ "source_url": None,
+ }
+ continue
+
+ if raw_doc:
+ sanitized = sanitize_raw_data(raw_doc)
+ yield {
+ "change": change,
+ "deleted": False,
+ "payload": sanitized,
+ "source_url": f"{(base_url or settings.SCIELO_BOOKS_BASE_URL)}/{db_name}/{doc_id}",
+ }
+ continue
+
+ document_payload, source_url = fetch_document(
+ doc_id=doc_id,
+ base_url=base_url,
+ db_name=db_name,
+ headers=headers,
+ )
+ yield {
+ "change": change,
+ "deleted": False,
+ "payload": document_payload,
+ "source_url": source_url,
+ }
+
+ last_seq = extract_last_seq(payload)
+ if last_seq is None or last_seq == current_since:
+ break
+ current_since = last_seq
diff --git a/core/models.py b/core/models.py
index 1aeab73..2a4ecbf 100644
--- a/core/models.py
+++ b/core/models.py
@@ -11,7 +11,7 @@
from wagtailautocomplete.edit_handlers import AutocompletePanel
from . import choices
-from .utils.utils import language_iso
+from .utils.standardizer import language_iso
User = get_user_model()
diff --git a/article/__init__.py b/core/tests/__init__.py
similarity index 100%
rename from article/__init__.py
rename to core/tests/__init__.py
diff --git a/core/tests/tests_collectors.py b/core/tests/tests_collectors.py
new file mode 100644
index 0000000..6d13a7c
--- /dev/null
+++ b/core/tests/tests_collectors.py
@@ -0,0 +1,55 @@
+import unittest
+from unittest.mock import patch
+
+from core.collectors import scielo_books
+
+
+class SciELOBooksCollectorTests(unittest.TestCase):
+ def test_build_url_appends_query_params(self):
+ url = scielo_books.build_url(
+ "https://books.example/_changes",
+ {"since": 10, "limit": 100},
+ )
+
+ self.assertEqual(url, "https://books.example/_changes?since=10&limit=100")
+
+ def test_sanitize_raw_data_renames__id(self):
+ payload = {"_id": "abc123", "TYPE": "Monograph"}
+
+ sanitized = scielo_books.sanitize_raw_data(payload)
+
+ self.assertEqual(sanitized["id"], "abc123")
+ self.assertNotIn("_id", sanitized)
+ self.assertEqual(sanitized["TYPE"], "Monograph")
+
+ def test_extract_last_seq_accepts_both_couch_formats(self):
+ self.assertEqual(scielo_books.extract_last_seq({"last_seq": 123}), 123)
+ self.assertEqual(scielo_books.extract_last_seq({"seq": 456}), 456)
+
+ @patch("core.collectors.scielo_books.fetch_document")
+ @patch("core.collectors.scielo_books.fetch_changes_page")
+ def test_iter_change_documents_uses_docs_from_changes_payload(self, mock_fetch_changes_page, mock_fetch_document):
+ mock_fetch_changes_page.side_effect = [
+ {
+ "results": [
+ {
+ "seq": 10,
+ "id": "book1",
+ "doc": {"_id": "book1", "TYPE": "Monograph", "title": "Book One"},
+ }
+ ],
+ "last_seq": 10,
+ },
+ {"results": [], "last_seq": 10},
+ ]
+
+ results = list(scielo_books.iter_change_documents(base_url="https://books.example", db_name="scielobooks_1a"))
+
+ self.assertEqual(len(results), 1)
+ self.assertEqual(results[0]["payload"]["id"], "book1")
+ self.assertEqual(results[0]["payload"]["TYPE"], "Monograph")
+ mock_fetch_document.assert_not_called()
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/core/tests_date_utils.py b/core/tests/tests_date_utils.py
similarity index 87%
rename from core/tests_date_utils.py
rename to core/tests/tests_date_utils.py
index 9f2b657..8d4f9b6 100644
--- a/core/tests_date_utils.py
+++ b/core/tests/tests_date_utils.py
@@ -86,7 +86,15 @@ def test_extract_minute_second_key(self):
key = extract_minute_second_key(dt)
self.assertEqual(key, '30:45')
+ def test_extract_minute_second_key_returns_none_for_invalid_datetime(self):
+ self.assertIsNone(extract_minute_second_key(None))
+ self.assertIsNone(extract_minute_second_key("invalid-date"))
+
def test_truncate_datetime_to_hour(self):
dt = datetime(2023, 3, 15, 14, 30, 45)
truncated = truncate_datetime_to_hour(dt)
self.assertEqual(truncated, datetime(2023, 3, 15, 14, 0, 0))
+
+ def test_truncate_datetime_to_hour_returns_none_for_invalid_datetime(self):
+ self.assertIsNone(truncate_datetime_to_hour(None))
+ self.assertIsNone(truncate_datetime_to_hour("invalid-date"))
diff --git a/core/tests_standardizer.py b/core/tests_standardizer.py
deleted file mode 100644
index a50ff87..0000000
--- a/core/tests_standardizer.py
+++ /dev/null
@@ -1,201 +0,0 @@
-from django.test import TestCase
-
-from core.utils import standardizer
-
-
-class StandardizerStandardizeCodeAndNameTest(TestCase):
-
- def test_standardize_code_and_name_returns_both(self):
- expected = [{"code": "CE", "name": "Ceará"}]
- text = "Ceará / CE"
- result = standardizer.standardize_code_and_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertDictEqual(expected[i], item)
-
- def test_standardize_code_and_name_returns_acronym(self):
- expected = [{"code": "CE", }]
- text = "CE"
- result = standardizer.standardize_code_and_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertDictEqual(expected[i], item)
-
- def test_standardize_code_and_name_returns_name(self):
- expected = [{"name": "Ceará"}]
- text = "Ceará"
- result = standardizer.standardize_code_and_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertDictEqual(expected[i], item)
-
- def test_standardize_code_and_name_returns_more_than_one_both(self):
- expected = [{"code": "CE", "name": "Ceará"},
- {"code": "SP", "name": "São Paulo"}]
- text = "Ceará / CE, São Paulo / SP"
- result = standardizer.standardize_code_and_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertDictEqual(expected[i], item)
-
- def test_standardize_code_and_name_returns_more_than_one_acronym(self):
- expected = [{"code": "CE", }, {"code": "SP", }]
- text = "CE / SP"
- result = standardizer.standardize_code_and_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertDictEqual(expected[i], item)
-
- def test_standardize_code_and_name_returns_more_than_one_name(self):
- expected = [{"name": "Ceará"}, {"name": "São Paulo"}]
- text = "Ceará - São Paulo"
- result = standardizer.standardize_code_and_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertDictEqual(expected[i], item)
-
-
-class StandardizerStandardizeNameTest(TestCase):
-
- def test_standardize_name(self):
- expected = ["Txto 1", "Texto 2", "Texto 3"]
- text = "Txto 1, Texto 2, Texto 3"
- result = standardizer.standardize_name(text)
- for i, item in enumerate(result):
- with self.subTest(i):
- self.assertEqual({"name": expected[i]}, item)
-
-
-class StandardizerStandardizeLanguageCode(TestCase):
- def test_standardize_language_code_en_us_is_valid(self):
- language_code = 'en-US'
- standardized = standardizer.standardize_language_code(language_code)
- self.assertEqual(standardized, 'en')
-
- def test_standardize_language_code_esp_is_valid(self):
- language_code = 'esp'
- standardized = standardizer.standardize_language_code(language_code)
- self.assertEqual(standardized, 'es')
-
- def test_standardize_language_code_pt_br_is_valid(self):
- language_code = 'pt-BR'
- standardized = standardizer.standardize_language_code(language_code)
- self.assertEqual(standardized, 'pt')
-
- def test_standardize_language_code_es_is_valid(self):
- language_code = 'spa'
- standardized = standardizer.standardize_language_code(language_code)
- self.assertEqual(standardized, 'es')
-
- def test_standardize_language_code_en_gb_is_valid(self):
- language_code = 'en-GB'
- standardized = standardizer.standardize_language_code(language_code)
- self.assertEqual(standardized, 'en')
-
-
-class StandardizerStandardizePIDV3(TestCase):
- def test_standardize_pid_v3_is_valid(self):
- pid_v3 = 'jGJccQ7bFdbz6wy3nfXGVdv'
- standardized = standardizer.standardize_pid_v3(pid_v3)
- self.assertEqual(standardized, 'jGJccQ7bFdbz6wy3nfXGVdv')
-
-
-class StandardizerStandardizePIDV2(TestCase):
- def test_standardize_pid_v2_is_valid(self):
- pid_v2 = 'S0102-67202020000100001'
- standardized = standardizer.standardize_pid_v2(pid_v2)
- self.assertEqual(standardized, 'S0102-67202020000100001')
-
-
-class StandardizerStandardizeDOI(TestCase):
- def test_standardize_doi_is_valid(self):
- doi = '10.1590/S0102-67202020000100001'
- standardized = standardizer.standardize_doi(doi)
- self.assertEqual(standardized, '10.1590/S0102-67202020000100001')
-
- def test_standardize_doi_is_valid_with_doi_prefix(self):
- doi = 'doi:10.1590/S0102-67202020000100001'
- standardized = standardizer.standardize_doi(doi)
- self.assertEqual(standardized, '10.1590/S0102-67202020000100001')
-
- def test_standardize_doi_is_valid_with_http_prefix(self):
- doi = 'http://doi.org/10.1590/S0102-67202020000100001'
- standardized = standardizer.standardize_doi(doi)
- self.assertEqual(standardized, '10.1590/S0102-67202020000100001')
-
- def test_standardize_doi_is_valid_with_https_prefix(self):
- doi = 'https://doi.org/10.1590/S0102-67202020000100001'
- standardized = standardizer.standardize_doi(doi)
- self.assertEqual(standardized, '10.1590/S0102-67202020000100001')
-
- def test_standardize_doi_is_valid_with_doi_prefix_and_http_prefix(self):
- doi = 'doi:http://doi.org/10.1590/S0102-67202020000100001'
- standardized = standardizer.standardize_doi(doi)
- self.assertEqual(standardized, '10.1590/S0102-67202020000100001')
-
- def test_standardize_doi_is_valid_with_doi_prefix_and_https_prefix(self):
- doi = 'doi:https://doi.org/10.1590/S0102-67202020000100001'
- standardized = standardizer.standardize_doi(doi)
- self.assertEqual(standardized, '10.1590/S0102-67202020000100001')
-
-
-class TestStandardizeYearOfPublication(TestCase):
- def test_standardize_year_of_publication_four_digit_year(self):
- """Test that a four-digit year is returned as-is"""
- year = "2023"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, "2023")
-
- def test_standardize_year_of_publication_integer_year(self):
- """Test that an integer year is converted to string"""
- year = 2023
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, "2023")
-
- def test_standardize_year_of_publication_year_range(self):
- """Test that a year range returns the first year"""
- year = "2020-2023"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, "2020")
-
- def test_standardize_year_of_publication_year_with_slash(self):
- """Test that a year with slash returns the first year"""
- year = "2020/2023"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, "2020")
-
- def test_standardize_year_of_publication_year_with_extra_text(self):
- """Test that year with extra text extracts the year"""
- year = "Published in 2023"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, "")
-
- def test_standardize_year_of_publication_invalid_year(self):
- """Test that invalid year returns None or empty string"""
- year = "invalid"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, '')
-
- def test_standardize_year_of_publication_empty_string(self):
- """Test that empty string returns None or empty string"""
- year = ""
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, '')
-
- def test_standardize_year_of_publication_none_input(self):
- """Test that None input returns None"""
- year = None
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, '')
-
- def test_standardize_year_of_publication_two_digit_year(self):
- """Test that two-digit year is converted to four-digit year"""
- year = "23"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, '')
-
- def test_standardize_year_of_publication_year_with_parentheses(self):
- """Test that year in parentheses is extracted"""
- year = "(2023)"
- result = standardizer.standardize_year_of_publication(year)
- self.assertEqual(result, '')
diff --git a/metrics/utils/file_utils.py b/core/utils/csv_utils.py
similarity index 100%
rename from metrics/utils/file_utils.py
rename to core/utils/csv_utils.py
diff --git a/core/utils/date_utils.py b/core/utils/date_utils.py
index 026d434..f20ffea 100644
--- a/core/utils/date_utils.py
+++ b/core/utils/date_utils.py
@@ -29,7 +29,7 @@ def get_date_obj(date_str: str, format: str = "%Y-%m-%d") -> datetime.date:
try:
return datetime.strptime(date_str, format).date()
except (ValueError, TypeError):
- ...
+ return None
def get_date_range_str(from_date_str: str = None, until_date_str: str = None, days_to_go_back: int = None) -> tuple[str, str]:
@@ -99,12 +99,9 @@ def truncate_datetime_to_hour(dt):
Returns:
datetime: The truncated datetime object.
"""
- if isinstance(dt, str):
- try:
- dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
- except ValueError:
- logging.error("Invalid datetime string format. Expected '%Y-%m-%d %H:%M:%S'.")
- return None
+ dt = _coerce_datetime(dt)
+ if dt is None:
+ return None
return dt.replace(minute=0, second=0, microsecond=0)
@@ -119,11 +116,23 @@ def extract_minute_second_key(dt):
Returns:
str: A string in the format "MM:SS" representing the minute and second.
"""
+ dt = _coerce_datetime(dt)
+ if dt is None:
+ return None
+
+ return f"{dt.minute:02}:{dt.second:02}"
+
+
+def _coerce_datetime(dt):
+ if isinstance(dt, datetime):
+ return dt
+
if isinstance(dt, str):
try:
- dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
+ return datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
except ValueError:
logging.error("Invalid datetime string format. Expected '%Y-%m-%d %H:%M:%S'.")
return None
- return f"{dt.minute:02}:{dt.second:02}"
+ logging.error("Invalid datetime value: %r.", dt)
+ return None
diff --git a/core/utils/utils.py b/core/utils/request_utils.py
similarity index 90%
rename from core/utils/utils.py
rename to core/utils/request_utils.py
index 0397338..c4fbec6 100644
--- a/core/utils/utils.py
+++ b/core/utils/request_utils.py
@@ -1,8 +1,6 @@
import logging
-import re
import requests
-from langcodes import standardize_tag, tag_is_valid
from tenacity import (
retry,
retry_if_exception_type,
@@ -17,13 +15,6 @@
User = get_user_model()
-def language_iso(code):
- code = re.split(r"-|_", code)[0] if code else ""
- if tag_is_valid(code):
- return standardize_tag(code)
- return ""
-
-
class RetryableError(Exception):
"""Recoverable error without having to modify the data state on the client
side, e.g. timeouts, errors from network partitioning, etc.
@@ -92,4 +83,4 @@ def _get_user(request, username=None, user_id=None):
if user_id:
return User.objects.get(pk=user_id)
if username:
- return User.objects.get(username=username)
\ No newline at end of file
+ return User.objects.get(username=username)
diff --git a/core/utils/standardizer.py b/core/utils/standardizer.py
index 27b5cba..c228bf5 100644
--- a/core/utils/standardizer.py
+++ b/core/utils/standardizer.py
@@ -1,247 +1,77 @@
-import langcodes
import re
-
-ITEMS_SEP_FOR_LOCATION = [";", ", ", "|", "/"]
-PARTS_SEP_FOR_LOCATION = [" - ", "- ", " -", ", ", "(", "/"]
-
-ITEMS_SEP_FOR_CITY = [",", "|"]
-PARTS_SEP_FOR_CITY = []
-
-
-def remove_extra_spaces(text):
- text = text and text.strip()
- if not text:
- return text
- # padroniza a quantidade de espaços
- return " ".join([item.strip() for item in text.split() if item.strip()])
-
-
-def standardize_code_and_name(original):
- """
- Dado o texto original, identifica pares de code e nome.
- Os separadores podem separar code e nome e/ou itens de lista.
- Ex.: USP / Unicamp
- São Paulo/SP, Rio de Janeiro/RJ
- """
- text_ = original
- text_ = text_ and text_.strip()
- if not text_:
- return []
-
- text_ = remove_extra_spaces(text_)
- if not text_:
- yield {"name": None}
- return
-
- items_separators = ITEMS_SEP_FOR_LOCATION
- parts_separators = PARTS_SEP_FOR_LOCATION
-
- PARTBR = "~PARTBR~"
- LINEBR = "~LINEBR~"
- for sep in items_separators:
- text_ = text_.replace(sep, PARTBR)
- for sep in parts_separators:
- text_ = text_.replace(sep, PARTBR)
-
- codes = []
- names = []
- for item in text_.split(PARTBR):
- item = item.strip()
- if not item:
- continue
- if len(item) == 2:
- codes.append(item)
- else:
- names.append(item)
-
- if len(names) == len(codes):
- for acron, name in zip(codes, names):
- yield {"code": acron, "name": name}
- elif len(names) == 0:
- for acron in codes:
- yield {"code": acron}
- elif len(codes) == 0:
- for name in names:
- yield {"name": name}
- else:
- # como o texto está bem fora do padrão,
- # pode-se evidenciar retornando o original
- yield {"name": original}
-
-
-def standardize_name(original):
- original = original and original.strip()
- if not original:
- return
-
- items_separators = ITEMS_SEP_FOR_CITY
-
- LINEBR = "~LINEBR~"
-
- text_ = original
- text_ = remove_extra_spaces(text_)
-
- for sep in items_separators:
- text_ = text_.replace(sep, LINEBR)
-
- for row in text_.split(LINEBR):
- row = row and row.strip()
- if not row:
- continue
- yield {"name": row}
+import langcodes
def standardize_language_code(language_code: str, threshold=0.75):
- """
- Standardizes a media language using langcodes library.
-
- Parameters:
- media_language (str): The media language to be standardized.
- threshold (float): The minimum score for a language to be considered valid. Default is 0.75.
-
- Returns:
- str: The standardized media language or None if the input is not a valid language tag.
- """
- if not language_code:
- return 'un'
-
- if langcodes.tag_is_valid(language_code):
- return langcodes.standardize_tag(language_code).split('-')[0]
-
- # Handle special cases
- if language_code.lower() == 'esp':
- return 'es'
-
- inferred_lang, score = langcodes.best_match(language_code, langcodes.LANGUAGE_ALPHA3.keys())
-
- if score >= threshold:
- return langcodes.standardize_tag(inferred_lang).split('-')[0]
-
- # Handle unknown languages
- return 'un'
+ language_code = str(language_code).strip().strip("'\"")
+ try:
+ lang = langcodes.get(language_code)
+ except Exception:
+ return "un"
+ return str(lang).split("-")[0]
def standardize_pid_v2(pid_v2):
- """
- Standardizes a PID v2.
-
- Parameters:
- pid_v2 (str): The PID v2 to be standardized.
-
- Returns:
- str: The standardized PID v2 or an empty string if the input is not a valid PID v2.
- """
- if not pid_v2 or not pid_v2.lower().startswith('s') or len(pid_v2) < 23:
- return ''
-
+ if not pid_v2 or not pid_v2.lower().startswith("s") or len(pid_v2) < 23:
+ return ""
+
if len(pid_v2) == 23:
return pid_v2[0].upper() + pid_v2[1:]
-
+
if len(pid_v2) > 23:
return pid_v2[0].upper() + pid_v2[1:23]
-
- if len(pid_v2) < 23:
- return ''
+ return ""
-def standardize_pid_v3(pid_v3):
- """
- Standardizes a PID v3 using langcodes library."
-
- Parameters:
- pid_v3 (str): The PID v3 to be standardized.
- Returns:
- str: The standardized PID v3 or an empty string if the input is not a valid PID v3.
- """
-
- if not pid_v3:
- return ''
-
- if len(pid_v3) == 23:
- return pid_v3
-
- if len(pid_v3) > 23:
- return pid_v3[:23]
-
- if len(pid_v3) < 23:
- return ''
+def standardize_pid_v3(pid_v3):
+ return str(pid_v3 or "")
def standardize_doi(text):
- """"
- Standardizes a DOI.
-
- Parameters:
- text (str): The DOI to be standardized.
-
- Returns:
- str: The standardized DOI
- """
- PATTERNS_DOI = [re.compile(pd) for pd in [
- r'10.\d{4,9}/[-._;()/:A-Z0-9]+$',
- r'10.1002/[^\s]+$',
- r'10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$',
- r'10.1207/[\w\d]+\&\d+_\d+$',
- r'10.\d{4,9}/[-._;()/:a-zA-Z0-9]*']
+ text = (text or "").strip()
+ if not text:
+ return ""
+
+ doi_prefixes = [
+ "https://doi.org/",
+ "http://doi.org/",
+ "https://dx.doi.org/",
+ "http://dx.doi.org/",
+ "doi.org/",
+ "dx.doi.org/",
+ "doi:",
]
- matched_doi = False
-
- for pattern_doi in PATTERNS_DOI:
- matched_doi = pattern_doi.search(text)
- if matched_doi:
+ for prefix in doi_prefixes:
+ if text.lower().startswith(prefix):
+ text = text[len(prefix):]
break
- if not matched_doi:
- return
-
- return matched_doi.group().upper()
+ if text.lower().startswith("10."):
+ return text
+
+ return ""
def standardize_pid_generic(pid_generic):
- """
- Standardizes a PID."
-
- Parameters:
- pid_generic (str): The PID to be standardized.
-
- Returns:
- str: The standardized PID or an empty string if the input is not a valid PID.
- """
-
- if not pid_generic:
- return ''
-
- pid_generic_based_on_doi = standardize_doi(pid_generic)
- if pid_generic_based_on_doi:
- return pid_generic_based_on_doi
-
- return pid_generic.strip().upper()
+ value = str(pid_generic or "").strip().upper()
+ value = re.sub(r"\s+", "", value)
+ value = value.rstrip(".,;:")
+ return value or ""
def standardize_year_of_publication(year_of_publication):
- """
- Standardizes a year of publication.
-
- Parameters:
- year_of_publication (str): The year of publication to be standardized.
-
- Returns:
- str: The standardized year of publication or an empty string if the input is not a valid year.
- """
- if not year_of_publication:
- return ''
-
- # Truncate to 4 characters if longer
- if isinstance(year_of_publication, str) and len(year_of_publication) > 4:
- year_of_publication = year_of_publication[:4]
-
- try:
- year = int(year_of_publication)
- if 1500 <= year <= 2100:
- return str(year)
- except ValueError:
- pass
-
- return ''
+ value = str(year_of_publication or "").strip()
+ if not value:
+ return ""
+ match = re.match(r"(\d{4})", value)
+ return match.group(1) if match else ""
+
+
+def language_iso(code):
+ code = re.split(r"-|_", code)[0] if code else ""
+ if langcodes.tag_is_valid(code):
+ return langcodes.standardize_tag(code)
+ return ""
diff --git a/django_celery_beat/views.py b/django_celery_beat/views.py
index 3a4ddb0..b5cff84 100644
--- a/django_celery_beat/views.py
+++ b/django_celery_beat/views.py
@@ -21,6 +21,13 @@ def task_run(request):
task = current_app.tasks.get(p_task.task)
+ if task is None:
+ messages.error(
+ request,
+ _("Task '{0}' not found in the Celery registry.").format(p_task.task),
+ )
+ return redirect(request.META.get("HTTP_REFERER"))
+
kwargs = json.loads(p_task.kwargs)
kwargs["user_id"] = request.user.id
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 6957700..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,29 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = .
-BUILDDIR = ./_build
-APP = /app
-
-.PHONY: help livehtml apidocs Makefile
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c .
-
-# Build, watch and serve docs with live reload
-livehtml:
- sphinx-autobuild -b html --host 0.0.0.0 --port 9000 --watch $(APP) -c . $(SOURCEDIR) $(BUILDDIR)/html
-
-# Outputs rst files from django application code
-apidocs:
- sphinx-apidoc -o $(SOURCEDIR)/api $(APP)
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -c .
diff --git a/docs/__init__.py b/docs/__init__.py
deleted file mode 100644
index 8772c82..0000000
--- a/docs/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Included so that Django's startproject comment runs against the docs directory
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index 51cd921..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import os
-import sys
-
-import django
-
-if os.getenv("READTHEDOCS", default=False) == "True":
- sys.path.insert(0, os.path.abspath(".."))
- os.environ["DJANGO_READ_DOT_ENV_FILE"] = "True"
- os.environ["USE_DOCKER"] = "no"
-else:
- sys.path.insert(0, os.path.abspath("/app"))
-os.environ["DATABASE_URL"] = "sqlite:///readthedocs.db"
-os.environ["CELERY_BROKER_URL"] = os.getenv("REDIS_URL", "redis://redis:6379")
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.local")
-django.setup()
-
-# -- Project information -----------------------------------------------------
-
-project = "SciELO Core"
-copyright = """2022, SciELO"""
-author = "SciELO"
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
- "sphinx.ext.autodoc",
- "sphinx.ext.napoleon",
-]
-
-# Add any paths that contain templates here, relative to this directory.
-# templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-html_theme = "alabaster"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ["_static"]
diff --git a/docs/howto.rst b/docs/howto.rst
deleted file mode 100644
index 9fae300..0000000
--- a/docs/howto.rst
+++ /dev/null
@@ -1,38 +0,0 @@
-How To - Project Documentation
-======================================================================
-
-Get Started
-----------------------------------------------------------------------
-
-Documentation can be written as rst files in `core/docs`.
-
-
-To build and serve docs, use the commands::
-
- docker compose -f local.yml up docs
-
-
-
-Changes to files in `docs/_source` will be picked up and reloaded automatically.
-
-`Sphinx `_ is the tool used to build documentation.
-
-Docstrings to Documentation
-----------------------------------------------------------------------
-
-The sphinx extension `apidoc `_ is used to automatically document code using signatures and docstrings.
-
-Numpy or Google style docstrings will be picked up from project files and availble for documentation. See the `Napoleon `_ extension for details.
-
-For an in-use example, see the `page source <_sources/users.rst.txt>`_ for :ref:`users`.
-
-To compile all docstrings automatically into documentation source files, use the command:
- ::
-
- make apidocs
-
-
-This can be done in the docker container:
- ::
-
- docker run --rm docs make apidocs
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index b6c6ded..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. SciELO Content Manager documentation master file, created by
- sphinx-quickstart.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Welcome to SciELO Core's documentation!
-======================================================================
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- howto
- users
-
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 4f70eed..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,46 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build -c .
-)
-set SOURCEDIR=_source
-set BUILDDIR=_build
-set APP=..\core
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.Install sphinx-autobuild for live serving.
- echo.If you don't have Sphinx installed, grab it from
- echo.http://sphinx-doc.org/
- exit /b 1
-)
-
-%SPHINXBUILD% -b %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:livehtml
-sphinx-autobuild -b html --open-browser -p 9000 --watch %APP% -c . %SOURCEDIR% %BUILDDIR%/html
-GOTO :EOF
-
-:apidocs
-sphinx-apidoc -o %SOURCEDIR%/api %APP%
-GOTO :EOF
-
-:help
-%SPHINXBUILD% -b help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/users.rst b/docs/users.rst
deleted file mode 100644
index 21e08aa..0000000
--- a/docs/users.rst
+++ /dev/null
@@ -1,15 +0,0 @@
- .. _users:
-
-Users
-======================================================================
-
-Starting a new project, it’s highly recommended to set up a custom user model,
-even if the default User model is sufficient for you.
-
-This model behaves identically to the default user model,
-but you’ll be able to customize it in the future if the need arises.
-
-.. automodule:: core.users.models
- :members:
- :noindex:
-
diff --git a/document/__init__.py b/document/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/__init__.py
@@ -0,0 +1 @@
+
diff --git a/journal/apps.py b/document/apps.py
similarity index 62%
rename from journal/apps.py
rename to document/apps.py
index e10a171..eb482d2 100644
--- a/journal/apps.py
+++ b/document/apps.py
@@ -1,6 +1,6 @@
from django.apps import AppConfig
-class JournalConfig(AppConfig):
+class DocumentConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
- name = "journal"
+ name = "document"
diff --git a/document/management/__init__.py b/document/management/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/management/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/management/commands/__init__.py b/document/management/commands/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/management/commands/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/management/commands/load_articles_by_year.py b/document/management/commands/load_articles_by_year.py
new file mode 100644
index 0000000..a922456
--- /dev/null
+++ b/document/management/commands/load_articles_by_year.py
@@ -0,0 +1,80 @@
+from django.core.management.base import BaseCommand
+
+from document.tasks import task_load_documents_from_article_meta
+from document.tasks import task_load_documents_from_opac
+
+
+class Command(BaseCommand):
+ help = "Generate task requests for loading document data by year"
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "--start-year",
+ type=int,
+ default=1990,
+ help="Start year (default: 1990)",
+ )
+ parser.add_argument(
+ "--end-year",
+ type=int,
+ default=2025,
+ help="End year (default: 2025)",
+ )
+ parser.add_argument(
+ "--collection",
+ type=str,
+ default="scl",
+ help="Collection code (default: scl)",
+ )
+ parser.add_argument(
+ "--task",
+ choices=["load_documents_from_opac", "load_documents_from_article_meta"],
+ default="load_documents_from_opac",
+ help="Task to execute (default: load_documents_from_opac)",
+ )
+
+ def handle(self, *args, **options):
+ start_year = options["start_year"]
+ end_year = options["end_year"]
+ collection = options["collection"]
+
+ self.stdout.write(
+ self.style.SUCCESS(
+ f"Generating task requests from {start_year} to {end_year} for collection: {collection}"
+ )
+ )
+
+ total_tasks = 0
+
+ for year in range(start_year, end_year + 1):
+ from_date = f"{year}-01-01"
+ until_date = f"{year}-12-31"
+
+ self.stdout.write(f"Queuing task for year {year}...")
+
+ if options["task"] == "load_documents_from_article_meta":
+ task_result = task_load_documents_from_article_meta.delay(
+ from_date=from_date,
+ until_date=until_date,
+ collection=collection,
+ )
+ else:
+ task_result = task_load_documents_from_opac.delay(
+ from_date=from_date,
+ until_date=until_date,
+ collection=collection,
+ )
+
+ total_tasks += 1
+
+ self.stdout.write(
+ self.style.SUCCESS(
+ f"✓ Task queued for year {year}: {from_date} to {until_date} (Task ID: {task_result.id})"
+ )
+ )
+
+ self.stdout.write(
+ self.style.SUCCESS(
+ f"\nCompleted! {total_tasks} tasks have been queued successfully."
+ )
+ )
diff --git a/document/migrations/0001_initial.py b/document/migrations/0001_initial.py
new file mode 100644
index 0000000..bff11be
--- /dev/null
+++ b/document/migrations/0001_initial.py
@@ -0,0 +1,279 @@
+# Generated by Django 5.0.7 on 2026-03-15 00:00
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ initial = True
+
+ dependencies = [
+ ("collection", "0001_initial"),
+ ("source", "0001_initial"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="Document",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(
+ auto_now_add=True,
+ verbose_name="Creation date",
+ ),
+ ),
+ (
+ "updated",
+ models.DateTimeField(
+ auto_now=True,
+ verbose_name="Last update date",
+ ),
+ ),
+ (
+ "document_type",
+ models.CharField(
+ choices=[
+ ("article", "Article"),
+ ("preprint", "Preprint"),
+ ("dataset", "Dataset"),
+ ("book", "Book"),
+ ("chapter", "Chapter"),
+ ("other", "Other"),
+ ],
+ db_index=True,
+ max_length=32,
+ verbose_name="Document Type",
+ ),
+ ),
+ (
+ "document_id",
+ models.CharField(
+ db_index=True,
+ max_length=255,
+ verbose_name="Document ID",
+ ),
+ ),
+ (
+ "scielo_issn",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=9,
+ null=True,
+ verbose_name="SciELO ISSN",
+ ),
+ ),
+ (
+ "pid_v2",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=23,
+ null=True,
+ verbose_name="PID V2",
+ ),
+ ),
+ (
+ "pid_v3",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=23,
+ null=True,
+ verbose_name="PID V3",
+ ),
+ ),
+ (
+ "pid_generic",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=255,
+ null=True,
+ verbose_name="PID Generic",
+ ),
+ ),
+ (
+ "title",
+ models.CharField(
+ blank=True,
+ max_length=500,
+ null=True,
+ verbose_name="Document Title",
+ ),
+ ),
+ (
+ "identifiers",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Identifiers",
+ ),
+ ),
+ (
+ "files",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Files",
+ ),
+ ),
+ (
+ "default_lang",
+ models.CharField(
+ blank=True,
+ max_length=8,
+ null=True,
+ verbose_name="Default Language",
+ ),
+ ),
+ (
+ "text_langs",
+ models.JSONField(
+ blank=True,
+ default=list,
+ null=True,
+ verbose_name="Text Languages",
+ ),
+ ),
+ (
+ "default_media_format",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Default Media Format",
+ ),
+ ),
+ (
+ "processing_date",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Processing Date",
+ ),
+ ),
+ (
+ "publication_date",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Publication Date",
+ ),
+ ),
+ (
+ "publication_year",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=4,
+ null=True,
+ verbose_name="Publication Year",
+ ),
+ ),
+ (
+ "extra_data",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Extra Data",
+ ),
+ ),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "parent_document",
+ models.ForeignKey(
+ blank=True,
+ db_index=True,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="child_documents",
+ to="document.document",
+ verbose_name="Parent Document",
+ ),
+ ),
+ (
+ "source",
+ models.ForeignKey(
+ blank=True,
+ db_index=True,
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="documents",
+ to="source.source",
+ verbose_name="Source",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Document",
+ "verbose_name_plural": "Documents",
+ "unique_together": {("collection", "document_type", "document_id")},
+ "indexes": [
+ models.Index(
+ fields=["collection", "document_type"],
+ name="document_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="document_collection_issn_idx",
+ ),
+ models.Index(
+ fields=["collection", "pid_v2"],
+ name="document_collection_pidv2_idx",
+ ),
+ models.Index(
+ fields=["collection", "pid_generic"],
+ name="doc_coll_pidgen_idx",
+ ),
+ ],
+ },
+ ),
+ ]
diff --git a/document/migrations/__init__.py b/document/migrations/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/migrations/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/models.py b/document/models.py
new file mode 100644
index 0000000..5197692
--- /dev/null
+++ b/document/models.py
@@ -0,0 +1,258 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+from core.models import CommonControlField
+from source.models import Source
+
+
+class Document(CommonControlField):
+ DOCUMENT_TYPE_ARTICLE = "article"
+ DOCUMENT_TYPE_PREPRINT = "preprint"
+ DOCUMENT_TYPE_DATASET = "dataset"
+ DOCUMENT_TYPE_BOOK = "book"
+ DOCUMENT_TYPE_CHAPTER = "chapter"
+ DOCUMENT_TYPE_OTHER = "other"
+ DOCUMENT_TYPE_CHOICES = (
+ (DOCUMENT_TYPE_ARTICLE, _("Article")),
+ (DOCUMENT_TYPE_PREPRINT, _("Preprint")),
+ (DOCUMENT_TYPE_DATASET, _("Dataset")),
+ (DOCUMENT_TYPE_BOOK, _("Book")),
+ (DOCUMENT_TYPE_CHAPTER, _("Chapter")),
+ (DOCUMENT_TYPE_OTHER, _("Other")),
+ )
+
+ collection = models.ForeignKey(
+ Collection,
+ verbose_name=_("Collection"),
+ on_delete=models.CASCADE,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ source = models.ForeignKey(
+ Source,
+ verbose_name=_("Source"),
+ on_delete=models.CASCADE,
+ related_name="documents",
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ parent_document = models.ForeignKey(
+ "self",
+ verbose_name=_("Parent Document"),
+ on_delete=models.SET_NULL,
+ related_name="child_documents",
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ document_type = models.CharField(
+ verbose_name=_("Document Type"),
+ max_length=32,
+ choices=DOCUMENT_TYPE_CHOICES,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ document_id = models.CharField(
+ verbose_name=_("Document ID"),
+ max_length=255,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ scielo_issn = models.CharField(
+ verbose_name=_("SciELO ISSN"),
+ max_length=9,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ pid_v2 = models.CharField(
+ verbose_name=_("PID V2"),
+ max_length=23,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ pid_v3 = models.CharField(
+ verbose_name=_("PID V3"),
+ max_length=23,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ pid_generic = models.CharField(
+ verbose_name=_("PID Generic"),
+ max_length=255,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ title = models.CharField(
+ verbose_name=_("Document Title"),
+ max_length=500,
+ blank=True,
+ null=True,
+ )
+
+ identifiers = models.JSONField(
+ verbose_name=_("Identifiers"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ files = models.JSONField(
+ verbose_name=_("Files"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ default_lang = models.CharField(
+ verbose_name=_("Default Language"),
+ max_length=8,
+ blank=True,
+ null=True,
+ )
+
+ text_langs = models.JSONField(
+ verbose_name=_("Text Languages"),
+ null=True,
+ blank=True,
+ default=list,
+ )
+
+ default_media_format = models.CharField(
+ verbose_name=_("Default Media Format"),
+ max_length=32,
+ blank=True,
+ null=True,
+ )
+
+ processing_date = models.CharField(
+ verbose_name=_("Processing Date"),
+ max_length=32,
+ blank=True,
+ null=True,
+ )
+
+ publication_date = models.CharField(
+ verbose_name=_("Publication Date"),
+ max_length=32,
+ blank=True,
+ null=True,
+ )
+
+ publication_year = models.CharField(
+ verbose_name=_("Publication Year"),
+ max_length=4,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ extra_data = models.JSONField(
+ verbose_name=_("Extra Data"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ def __str__(self):
+ return f"{self.collection.acron3} - {self.document_type} - {self.document_id}"
+
+ @classmethod
+ def metadata(cls, collection=None):
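+ # Yield one flat metadata dict per document, merging fields from the related collection and source.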
+ queryset = cls.objects.select_related("collection", "source").only(
+ "collection__acron3",
+ "default_lang",
+ "default_media_format",
+ "document_id",
+ "document_type",
+ "extra_data",
+ "files",
+ "identifiers",
+ "parent_document__document_id",
+ "pid_generic",
+ "pid_v2",
+ "pid_v3",
+ "processing_date",
+ "publication_date",
+ "publication_year",
+ "scielo_issn",
+ "source__scielo_issn",
+ "source__source_id",
+ "source__source_type",
+ "text_langs",
+ "title",
+ )
+
+ if collection:
+ queryset = queryset.filter(collection=collection)
+
+ for document in queryset.iterator():
+ source = document.source
+ yield {
+ "collection": document.collection.acron3,
+ "default_lang": document.default_lang,
+ "default_media_format": document.default_media_format,
+ "document_id": document.document_id,
+ "document_type": document.document_type,
+ "extra_data": document.extra_data or {},
+ "files": document.files or {},
+ "identifiers": document.identifiers or {},
+ "parent_document_id": (
+ document.parent_document.document_id if document.parent_document else None
+ ),
+ "pid_generic": document.pid_generic,
+ "pid_v2": document.pid_v2,
+ "pid_v3": document.pid_v3,
+ "processing_date": document.processing_date,
+ "publication_date": document.publication_date,
+ "publication_year": document.publication_year,
+ "scielo_issn": document.scielo_issn or (source.scielo_issn if source else None),
+ "source_id": source.source_id if source else None,
+ "source_type": source.source_type if source else None,
+ "text_langs": document.text_langs or [],
+ "title": document.title,
+ }
+
+ class Meta:
+ verbose_name = _("Document")
+ verbose_name_plural = _("Documents")
+ unique_together = (
+ "collection",
+ "document_type",
+ "document_id",
+ )
+ indexes = [
+ models.Index(
+ fields=["collection", "document_type"],
+ name="document_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="document_collection_issn_idx",
+ ),
+ models.Index(
+ fields=["collection", "pid_v2"],
+ name="document_collection_pidv2_idx",
+ ),
+ models.Index(
+ fields=["collection", "pid_generic"],
+ name="doc_coll_pidgen_idx",
+ ),
+ ]
diff --git a/document/services/__init__.py b/document/services/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/document/services/__init__.py
@@ -0,0 +1 @@
+
diff --git a/document/services/articles.py b/document/services/articles.py
new file mode 100644
index 0000000..09244b3
--- /dev/null
+++ b/document/services/articles.py
@@ -0,0 +1,166 @@
+from document.models import Document
+
+from .common import build_document_id, compact_dict, get_existing_document, normalize_langs, normalize_year
+
+
+def upsert_article_document_from_articlemeta(
+ payload,
+ collection,
+ source=None,
+ user=None,
+ force_update=True,
+):
+ pid_v2 = payload.get("code")
+ document_id = build_document_id(pid_v2, payload.get("pid_v3"), payload.get("pid_generic"))
+ if not document_id:
+ return None
+
+ document = get_existing_document(
+ collection,
+ Document.DOCUMENT_TYPE_ARTICLE,
+ document_id,
+ pid_v2,
+ )
+ created = document is None
+ if created:
+ document = Document(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_ARTICLE,
+ document_id=document_id,
+ )
+ if user:
+ document.creator = user
+
+ if created or force_update:
+ document.source = source
+ document.parent_document = None
+ document.scielo_issn = source.scielo_issn if source else None
+ document.pid_v2 = pid_v2 or document.pid_v2
+ document.pid_v3 = payload.get("pid_v3") or document.pid_v3
+ document.pid_generic = payload.get("pid_generic") or document.pid_generic
+ document.title = payload.get("title") or document.title
+ document.identifiers = _merge_dicts(
+ document.identifiers,
+ _build_articlemeta_identifiers(payload, source),
+ )
+ document.files = payload.get("pdfs") or document.files or {}
+ document.default_lang = payload.get("default_language") or document.default_lang
+ document.text_langs = normalize_langs(payload.get("text_langs"))
+ document.default_media_format = document.default_media_format
+ document.processing_date = payload.get("processing_date") or document.processing_date
+ document.publication_date = payload.get("publication_date") or document.publication_date
+ document.publication_year = normalize_year(
+ payload.get("publication_year"),
+ fallback_date=document.publication_date,
+ )
+ document.extra_data = _merge_dicts(
+ document.extra_data,
+ compact_dict(
+ {
+ "provider": "articlemeta",
+ "issn_codes": payload.get("code_title"),
+ }
+ ),
+ )
+
+ if user:
+ document.updated_by = user
+
+ document.save()
+ return document
+
+
+def upsert_article_document_from_opac(
+ payload,
+ collection,
+ source=None,
+ user=None,
+ force_update=True,
+):
+ pid_v2 = payload.get("pid_v2")
+ pid_v3 = payload.get("pid_v3")
+ document_id = build_document_id(pid_v2, pid_v3, payload.get("pid_generic"))
+ if not document_id:
+ return None
+
+ document = get_existing_document(
+ collection,
+ Document.DOCUMENT_TYPE_ARTICLE,
+ document_id,
+ pid_v2,
+ pid_v3,
+ payload.get("pid_generic"),
+ )
+ created = document is None
+ if created:
+ document = Document(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_ARTICLE,
+ document_id=document_id,
+ )
+ if user:
+ document.creator = user
+
+ if created or force_update:
+ document.source = source
+ document.parent_document = None
+ document.scielo_issn = source.scielo_issn if source else None
+ document.pid_v2 = pid_v2 or document.pid_v2
+ document.pid_v3 = pid_v3 or document.pid_v3
+ document.pid_generic = payload.get("pid_generic") or document.pid_generic
+ document.title = payload.get("title") or document.title
+ document.identifiers = _merge_dicts(
+ document.identifiers,
+ _build_opac_identifiers(payload, source),
+ )
+ document.files = document.files or {}
+ document.default_lang = payload.get("default_language") or document.default_lang
+ document.text_langs = normalize_langs(payload.get("text_langs")) or document.text_langs or []
+ document.default_media_format = document.default_media_format
+ document.processing_date = document.processing_date
+ document.publication_date = payload.get("publication_date") or document.publication_date
+ document.publication_year = normalize_year(
+ payload.get("publication_year"),
+ fallback_date=document.publication_date,
+ )
+ document.extra_data = _merge_dicts(
+ document.extra_data,
+ compact_dict(
+ {
+ "provider": "opac",
+ "journal_acronym": payload.get("journal_acronym"),
+ }
+ ),
+ )
+
+ if user:
+ document.updated_by = user
+
+ document.save()
+ return document
+
+
+def _build_articlemeta_identifiers(payload, source):
+ return compact_dict(
+ {
+ "pid_v2": payload.get("code"),
+ "scielo_issn": source.scielo_issn if source else None,
+ }
+ )
+
+
+def _build_opac_identifiers(payload, source):
+ return compact_dict(
+ {
+ "pid_v2": payload.get("pid_v2"),
+ "pid_v3": payload.get("pid_v3"),
+ "scielo_issn": source.scielo_issn if source else None,
+ "journal_acronym": payload.get("journal_acronym"),
+ }
+ )
+
+
+def _merge_dicts(current, new_values):
+ merged = dict(current or {})
+ merged.update(new_values or {})
+ return merged
diff --git a/document/services/books.py b/document/services/books.py
new file mode 100644
index 0000000..96d92e1
--- /dev/null
+++ b/document/services/books.py
@@ -0,0 +1,256 @@
+from document.models import Document
+
+
+def build_book_pid_generic(book_id):
+ if book_id in (None, ""):
+ return None
+ return f"book:{book_id}"
+
+
+def build_chapter_pid_generic(book_id, chapter_id):
+ if book_id in (None, "") or chapter_id in (None, ""):
+ return None
+ return f"book:{book_id}/chapter:{chapter_id}"
+
+
+def enrich_part_payload(payload, monograph_payload):
+ if not monograph_payload:
+ return payload
+
+ enriched = dict(payload)
+ enriched["monograph_title"] = monograph_payload.get("title")
+ enriched["monograph_language"] = monograph_payload.get("language")
+ enriched["monograph_publication_date"] = monograph_payload.get("publication_date")
+ enriched["monograph_year"] = monograph_payload.get("year")
+ enriched["monograph_publisher"] = monograph_payload.get("publisher")
+ enriched["monograph_isbn"] = monograph_payload.get("isbn")
+ enriched["monograph_eisbn"] = monograph_payload.get("eisbn")
+ enriched["monograph_doi_number"] = monograph_payload.get("doi_number")
+ enriched["monograph_creators"] = monograph_payload.get("creators")
+ return enriched
+
+
+def upsert_monograph_document(
+ payload,
+ collection,
+ source=None,
+ user=None,
+ force_update=True,
+ source_url=None,
+ last_seq=None,
+):
+ if payload.get("TYPE") != "Monograph":
+ return None
+
+ book_id = str(payload.get("id"))
+ pid_generic = build_book_pid_generic(book_id)
+ document, created = Document.objects.get_or_create(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_BOOK,
+ document_id=pid_generic,
+ )
+
+ if created and user:
+ document.creator = user
+
+ if created or force_update:
+ document.source = source
+ document.parent_document = None
+ document.scielo_issn = None
+ document.pid_v2 = None
+ document.pid_v3 = None
+ document.pid_generic = pid_generic
+ document.title = payload.get("title") or book_id
+ document.identifiers = _build_monograph_identifiers(payload)
+ document.files = {}
+ document.default_lang = payload.get("language") or None
+ document.text_langs = _unique_list(payload.get("language"))
+ document.default_media_format = None
+ document.processing_date = None
+ document.publication_date = payload.get("publication_date") or None
+ document.publication_year = _normalize_year(payload.get("year"))
+ document.extra_data = _build_monograph_extra_data(
+ payload,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+
+ if user:
+ document.updated_by = user
+
+ document.save()
+ return document
+
+
+def upsert_part_document(
+ payload,
+ collection,
+ source=None,
+ parent_document=None,
+ user=None,
+ force_update=True,
+ source_url=None,
+ last_seq=None,
+):
+ if payload.get("TYPE") != "Part":
+ return None
+
+ book_id = payload.get("monograph")
+ chapter_id = payload.get("id")
+ pid_generic = build_chapter_pid_generic(book_id, chapter_id)
+ document, created = Document.objects.get_or_create(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_CHAPTER,
+ document_id=pid_generic,
+ )
+
+ if created and user:
+ document.creator = user
+
+ if created or force_update:
+ document.source = source
+ document.parent_document = parent_document
+ document.scielo_issn = None
+ document.pid_v2 = None
+ document.pid_v3 = None
+ document.pid_generic = pid_generic
+ document.title = payload.get("title") or str(chapter_id)
+ document.identifiers = _build_part_identifiers(payload)
+ document.files = {}
+ document.default_lang = (
+ payload.get("text_language")
+ or payload.get("monograph_language")
+ or None
+ )
+ document.text_langs = _unique_list(
+ payload.get("text_language") or payload.get("monograph_language")
+ )
+ document.default_media_format = None
+ document.processing_date = None
+ document.publication_date = payload.get("monograph_publication_date") or None
+ document.publication_year = _normalize_year(payload.get("monograph_year"))
+ document.extra_data = _build_part_extra_data(
+ payload,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+
+ if user:
+ document.updated_by = user
+
+ document.save()
+ return document
+
+
+def delete_book_document(collection, book_id):
+ return Document.objects.filter(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_BOOK,
+ document_id=build_book_pid_generic(book_id),
+ ).delete()
+
+
+def delete_document_by_raw_id(collection, raw_id):
+ return Document.objects.filter(
+ collection=collection,
+ extra_data__raw_id=str(raw_id),
+ ).delete()
+
+
+def has_monograph_document_for_raw_id(collection, raw_id):
+ return Document.objects.filter(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_BOOK,
+ extra_data__raw_id=str(raw_id),
+ ).exists()
+
+
+def get_monograph_document(collection, book_id):
+ return Document.objects.filter(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_BOOK,
+ document_id=build_book_pid_generic(book_id),
+ ).first()
+
+
+def _build_monograph_identifiers(payload):
+ identifiers = {
+ "book_id": str(payload.get("id")) if payload.get("id") is not None else None,
+ "isbn": payload.get("isbn"),
+ "eisbn": payload.get("eisbn"),
+ "doi": payload.get("doi_number"),
+ }
+ return _compact_dict(identifiers)
+
+
+def _build_part_identifiers(payload):
+ identifiers = {
+ "book_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None,
+ "chapter_id": str(payload.get("id")) if payload.get("id") is not None else None,
+ "isbn": payload.get("monograph_isbn"),
+ "eisbn": payload.get("monograph_eisbn"),
+ "doi": payload.get("doi_number"),
+ "book_doi": payload.get("monograph_doi_number"),
+ }
+ return _compact_dict(identifiers)
+
+
+def _build_monograph_extra_data(payload, source_url=None, last_seq=None):
+ extra_data = {
+ "raw_id": str(payload.get("id")) if payload.get("id") is not None else None,
+ "raw_type": payload.get("TYPE"),
+ "source_url": source_url,
+ "last_seq": last_seq,
+ "visible": payload.get("visible"),
+ "city": payload.get("city"),
+ "country": payload.get("country"),
+ "pages": payload.get("pages"),
+ "publisher": payload.get("publisher"),
+ "creators": payload.get("creators"),
+ "translated_titles": payload.get("translated_titles"),
+ "translated_synopses": payload.get("translated_synopses"),
+ "synopsis": payload.get("synopsis"),
+ }
+ return _compact_dict(extra_data)
+
+
+def _build_part_extra_data(payload, source_url=None, last_seq=None):
+ extra_data = {
+ "raw_id": str(payload.get("id")) if payload.get("id") is not None else None,
+ "raw_type": payload.get("TYPE"),
+ "source_url": source_url,
+ "last_seq": last_seq,
+ "visible": payload.get("visible"),
+ "order": payload.get("order"),
+ "pages": payload.get("pages"),
+ "creators": payload.get("creators"),
+ "translated_titles": payload.get("translated_titles"),
+ "monograph_id": str(payload.get("monograph")) if payload.get("monograph") is not None else None,
+ "monograph_title": payload.get("monograph_title"),
+ "monograph_language": payload.get("monograph_language"),
+ "monograph_publication_date": payload.get("monograph_publication_date"),
+ "monograph_year": payload.get("monograph_year"),
+ "monograph_publisher": payload.get("monograph_publisher"),
+ "monograph_creators": payload.get("monograph_creators"),
+ }
+ return _compact_dict(extra_data)
+
+
+def _unique_list(value):
+ if not value:
+ return []
+ return [value]
+
+
+def _normalize_year(value):
+ if value in (None, ""):
+ return None
+ return str(value)[:4]
+
+
+def _compact_dict(data):
+ return {
+ key: value
+ for key, value in data.items()
+ if value not in (None, "", [], {}, ())
+ }
diff --git a/document/services/common.py b/document/services/common.py
new file mode 100644
index 0000000..91e103d
--- /dev/null
+++ b/document/services/common.py
@@ -0,0 +1,58 @@
+from document.models import Document
+
+
+def build_document_id(*values):
+ for value in values:
+ if value not in (None, ""):
+ return str(value)
+ return None
+
+
+def get_existing_document(collection, document_type, *identifiers):
+ identifiers = [str(value) for value in identifiers if value not in (None, "")]
+ if not identifiers:
+ return None
+
+ queryset = Document.objects.filter(
+ collection=collection,
+ document_type=document_type,
+ )
+
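+ # Try each identifier field in priority order and return the first matching document.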
+ for field_name in ("document_id", "pid_v2", "pid_v3", "pid_generic"):
+ for identifier in identifiers:
+ document = queryset.filter(**{field_name: identifier}).first()
+ if document:
+ return document
+
+ return None
+
+
+def normalize_langs(value):
+ if not value:
+ return []
+
+ if isinstance(value, list):
+ return [item for item in value if item not in (None, "")]
+
+ if isinstance(value, dict):
+ return [key for key, enabled in value.items() if enabled]
+
+ return [value]
+
+
+def normalize_year(value, fallback_date=None):
+ if value not in (None, ""):
+ return str(value)[:4]
+
+ if fallback_date not in (None, ""):
+ return str(fallback_date)[:4]
+
+ return None
+
+
+def compact_dict(data):
+ return {
+ key: value
+ for key, value in data.items()
+ if value not in (None, "", [], {}, ())
+ }
diff --git a/document/services/datasets.py b/document/services/datasets.py
new file mode 100644
index 0000000..2496b20
--- /dev/null
+++ b/document/services/datasets.py
@@ -0,0 +1,69 @@
+from document.models import Document
+
+from .common import compact_dict, normalize_year
+
+
+def upsert_dataset_document(
+ payload,
+ collection,
+ user=None,
+ force_update=True,
+):
+ dataset_doi = payload.get("dataset_doi")
+ if not dataset_doi:
+ return None
+
+ document, created = Document.objects.get_or_create(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_DATASET,
+ document_id=dataset_doi,
+ )
+
+ if created and user:
+ document.creator = user
+
+ if created or force_update:
+ files = dict(document.files or {})
+ file_id = payload.get("file_id")
+ if file_id:
+ files[str(file_id)] = compact_dict(
+ {
+ "name": payload.get("file_name"),
+ "url": payload.get("file_url"),
+ "file_persistent_id": payload.get("file_persistent_id"),
+ }
+ )
+
+ document.source = None
+ document.parent_document = None
+ document.scielo_issn = None
+ document.pid_v2 = None
+ document.pid_v3 = None
+ document.pid_generic = dataset_doi
+ document.title = payload.get("title") or document.title
+ document.identifiers = compact_dict(
+ {
+ "dataset_doi": dataset_doi,
+ }
+ )
+ document.files = files
+ document.default_lang = document.default_lang
+ document.text_langs = document.text_langs or []
+ document.default_media_format = document.default_media_format
+ document.processing_date = document.processing_date
+ document.publication_date = payload.get("dataset_published") or document.publication_date
+ document.publication_year = normalize_year(
+ None,
+ fallback_date=document.publication_date,
+ )
+ document.extra_data = compact_dict(
+ {
+ "provider": "dataverse",
+ }
+ )
+
+ if user:
+ document.updated_by = user
+
+ document.save()
+ return document
diff --git a/document/services/preprints.py b/document/services/preprints.py
new file mode 100644
index 0000000..4be89f1
--- /dev/null
+++ b/document/services/preprints.py
@@ -0,0 +1,58 @@
+from document.models import Document
+
+from .common import compact_dict, normalize_langs, normalize_year
+
+
+def upsert_preprint_document(
+ payload,
+ collection,
+ user=None,
+ force_update=True,
+):
+ pid_generic = payload.get("pid_generic")
+ if not pid_generic:
+ return None
+
+ document, created = Document.objects.get_or_create(
+ collection=collection,
+ document_type=Document.DOCUMENT_TYPE_PREPRINT,
+ document_id=pid_generic,
+ )
+
+ if created and user:
+ document.creator = user
+
+ if created or force_update:
+ document.source = None
+ document.parent_document = None
+ document.scielo_issn = None
+ document.pid_v2 = None
+ document.pid_v3 = None
+ document.pid_generic = pid_generic
+ document.title = payload.get("title") or document.title
+ document.identifiers = compact_dict(
+ {
+ "pid_generic": pid_generic,
+ }
+ )
+ document.files = document.files or {}
+ document.default_lang = payload.get("default_language") or document.default_lang
+ document.text_langs = normalize_langs(payload.get("text_langs"))
+ document.default_media_format = document.default_media_format
+ document.processing_date = document.processing_date
+ document.publication_date = payload.get("publication_date") or document.publication_date
+ document.publication_year = normalize_year(
+ payload.get("publication_year"),
+ fallback_date=document.publication_date,
+ )
+ document.extra_data = compact_dict(
+ {
+ "provider": "preprints",
+ }
+ )
+
+ if user:
+ document.updated_by = user
+
+ document.save()
+ return document
diff --git a/document/tasks/__init__.py b/document/tasks/__init__.py
new file mode 100644
index 0000000..95a0ba5
--- /dev/null
+++ b/document/tasks/__init__.py
@@ -0,0 +1,28 @@
+from .articlemeta import (
+ load_documents_from_article_meta,
+ task_load_documents_from_article_meta,
+)
+from .common import (
+ get_latest_scielo_books_last_seq,
+)
+from .dataverse import (
+ load_dataset_metadata_from_dataverse,
+ task_load_dataset_metadata_into_documents,
+)
+from .opac import (
+ load_documents_from_opac,
+ task_load_documents_from_opac,
+)
+from .pipeline import (
+ task_daily_metadata_sync_pipeline,
+)
+from .preprints import (
+ load_preprints_from_preprints_api,
+ task_load_preprints_into_documents,
+)
+from .scielo_books import (
+ load_documents_from_scielo_books,
+ sync_documents_from_scielo_books,
+ task_load_documents_from_scielo_books,
+ task_sync_documents_from_scielo_books,
+)
diff --git a/document/tasks/articlemeta.py b/document/tasks/articlemeta.py
new file mode 100644
index 0000000..75b2689
--- /dev/null
+++ b/document/tasks/articlemeta.py
@@ -0,0 +1,120 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import articlemeta as articlemeta_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import articles as article_service
+from source.services import journals as journal_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_documents_from_article_meta(
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ collection=None,
+ issn=None,
+ force_update=True,
+ user=None,
+):
+ from_date, until_date = date_utils.get_date_range_str(
+ from_date,
+ until_date,
+ days_to_go_back,
+ )
+ logging.info(
+ "Loading documents from Article Meta. From: %s, Until: %s, Collection: %s, ISSN: %s",
+ from_date,
+ until_date,
+ collection,
+ issn,
+ )
+
+ offset = 0
+ limit = 1000
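+ # Page through the Article Meta API using offset/limit until an empty batch is returned.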
+ while True:
+ response = articlemeta_collector.fetch_article_counter_dict(
+ from_date,
+ until_date,
+ offset=offset,
+ limit=limit,
+ collection=collection,
+ issn=issn,
+ )
+ objects = response.get("objects") or []
+ if not objects:
+ break
+
+ for payload in objects:
+ collection_obj = _get_collection(payload.get("collection") or collection)
+ if not collection_obj:
+ logging.info(
+ "Collection not found for payload %s",
+ payload.get("code"),
+ )
+ continue
+
+ source = journal_service.find_journal_source_by_issns(
+ collection_obj,
+ payload.get("code_title"),
+ )
+ if not source:
+ logging.info(
+ "Source not found for collection %s and ISSNs %s",
+ collection_obj.acron3,
+ payload.get("code_title"),
+ )
+ continue
+
+ try:
+ article_service.upsert_article_document_from_articlemeta(
+ payload,
+ collection=collection_obj,
+ source=source,
+ user=user,
+ force_update=force_update,
+ )
+ except DataError as exc:
+ logging.error(
+ "Error saving Document from Article Meta. "
+ "Collection: %s, Source: %s, PIDv2: %s. Error: %s",
+ collection_obj,
+ source.source_id,
+ payload.get('code'),
+ exc
+ )
+ continue
+
+ offset += limit
+
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Article Meta)"), timelimit=-1, queue="load")
+def task_load_documents_from_article_meta(
+ self,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ collection=None,
+ issn=None,
+ force_update=True,
+ user_id=None,
+ username=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_documents_from_article_meta(
+ from_date=from_date,
+ until_date=until_date,
+ days_to_go_back=days_to_go_back,
+ collection=collection,
+ issn=issn,
+ force_update=force_update,
+ user=user,
+ )
diff --git a/document/tasks/common.py b/document/tasks/common.py
new file mode 100644
index 0000000..1645918
--- /dev/null
+++ b/document/tasks/common.py
@@ -0,0 +1,43 @@
+import logging
+
+from collection.models import Collection
+from document.models import Document
+from source.models import Source
+
+
+def _get_collection(acronym):
+ if not acronym:
+ return None
+ return Collection.objects.filter(acron3=acronym).first()
+
+
+def get_latest_scielo_books_last_seq(collection="books"):
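+ # Return the highest last_seq recorded in extra_data across documents and book sources of the collection.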
+ document_last_seq = _get_latest_last_seq_from_queryset(
+ Document.objects.filter(collection__acron3=collection).only("extra_data")
+ )
+ source_last_seq = _get_latest_last_seq_from_queryset(
+ Source.objects.filter(
+ collection__acron3=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ ).only("extra_data")
+ )
+ return max(document_last_seq, source_last_seq)
+
+
+def _get_latest_last_seq_from_queryset(queryset):
+ latest = 0
+ for item in queryset.iterator():
+ value = _coerce_last_seq((item.extra_data or {}).get("last_seq"))
+ if value is not None and value > latest:
+ latest = value
+ return latest
+
+
+def _coerce_last_seq(value):
+ if value in (None, ""):
+ return None
+ try:
+ return int(value)
+ except (TypeError, ValueError):
+ logging.warning("Ignoring invalid SciELO Books last_seq value: %r", value)
+ return None
diff --git a/document/tasks/dataverse.py b/document/tasks/dataverse.py
new file mode 100644
index 0000000..15618a5
--- /dev/null
+++ b/document/tasks/dataverse.py
@@ -0,0 +1,80 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import dataverse as dataverse_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import datasets as dataset_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_dataset_metadata_from_dataverse(
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ force_update=True,
+ user=None,
+):
+ from_date, until_date = date_utils.get_date_range_str(
+ from_date,
+ until_date,
+ days_to_go_back,
+ )
+ logging.info(
+ "Loading dataset metadata into documents. From: %s, Until: %s",
+ from_date,
+ until_date,
+ )
+
+ collection_obj = _get_collection("data")
+ if not collection_obj:
+ logging.error("Collection not found: data")
+ return False
+
+ for payload in dataverse_collector.iter_dataset_metadata(from_date, until_date):
+ if not payload.get("dataset_doi"):
+ logging.error("Dataset DOI not found in record: %s", payload)
+ continue
+
+ try:
+ dataset_service.upsert_dataset_document(
+ payload,
+ collection=collection_obj,
+ user=user,
+ force_update=force_update,
+ )
+ except DataError as exc:
+ logging.error(
+ "Error saving Dataset Document. Collection: %s, PID: %s. Error: %s",
+ collection_obj,
+                payload.get("dataset_doi"),
+                exc,
+ )
+ continue
+
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Dataverse)"), timelimit=-1, queue="load")
+def task_load_dataset_metadata_into_documents(
+ self,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ force_update=True,
+ user_id=None,
+ username=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_dataset_metadata_from_dataverse(
+ from_date=from_date,
+ until_date=until_date,
+ days_to_go_back=days_to_go_back,
+ force_update=force_update,
+ user=user,
+ )
diff --git a/document/tasks/opac.py b/document/tasks/opac.py
new file mode 100644
index 0000000..5e1c81e
--- /dev/null
+++ b/document/tasks/opac.py
@@ -0,0 +1,107 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import opac as opac_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import articles as article_service
+from source.services import journals as journal_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_documents_from_opac(
+ collection="scl",
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ page=1,
+ force_update=True,
+ user=None,
+):
+ from_date, until_date = date_utils.get_date_range_str(
+ from_date,
+ until_date,
+ days_to_go_back,
+ )
+ logging.info(
+ "Loading documents from OPAC. From: %s, Until: %s, Collection: %s",
+ from_date,
+ until_date,
+ collection,
+ )
+
+ collection_obj = _get_collection(collection)
+ if not collection_obj:
+ logging.error("Collection not found: %s", collection)
+ return False
+
+ while True:
+ response = opac_collector.fetch_counter_dict(from_date, until_date, page=page)
+ documents = response.get("documents") or {}
+
+ for payload in documents.values():
+ source = journal_service.find_journal_source_by_acronym(
+ collection_obj,
+ payload.get("journal_acronym"),
+ )
+ if not source:
+ logging.info(
+ "Source not found for collection %s and acronym %s",
+ collection_obj.acron3,
+ payload.get("journal_acronym"),
+ )
+ continue
+
+ try:
+ article_service.upsert_article_document_from_opac(
+ payload,
+ collection=collection_obj,
+ source=source,
+ user=user,
+ force_update=force_update,
+ )
+ except DataError as exc:
+ logging.error(
+ "Error saving Document from OPAC. "
+ "Collection: %s, Source: %s, PIDv2: %s. Error: %s",
+ collection_obj,
+ source.source_id,
+                    payload.get("pid_v2"),
+                    exc,
+ )
+ continue
+
+ page += 1
+ if page > int(response.get("pages", 0)):
+ break
+
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (OPAC)"), timelimit=-1, queue="load")
+def task_load_documents_from_opac(
+ self,
+ collection="scl",
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ page=1,
+ force_update=True,
+ user_id=None,
+ username=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_documents_from_opac(
+ collection=collection,
+ from_date=from_date,
+ until_date=until_date,
+ days_to_go_back=days_to_go_back,
+ page=page,
+ force_update=force_update,
+ user=user,
+ )
diff --git a/document/tasks/pipeline.py b/document/tasks/pipeline.py
new file mode 100644
index 0000000..97bef7c
--- /dev/null
+++ b/document/tasks/pipeline.py
@@ -0,0 +1,24 @@
+import logging
+
+from celery import group
+from django.utils.translation import gettext as _
+
+from config import celery_app
+
+from .articlemeta import task_load_documents_from_article_meta
+from .dataverse import task_load_dataset_metadata_into_documents
+from .opac import task_load_documents_from_opac
+from .preprints import task_load_preprints_into_documents
+from .scielo_books import task_sync_documents_from_scielo_books
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Daily Sync Routine (Auto)"), queue="load")
+def task_daily_metadata_sync_pipeline(self):
+ logging.info("Starting Daily Metadata Sync Pipeline")
+ group([
+ task_load_documents_from_article_meta.s(),
+ task_load_documents_from_opac.s(),
+ task_load_preprints_into_documents.s(),
+ task_load_dataset_metadata_into_documents.s(),
+ task_sync_documents_from_scielo_books.s(),
+ ]).apply_async()
diff --git a/document/tasks/preprints.py b/document/tasks/preprints.py
new file mode 100644
index 0000000..ee63211
--- /dev/null
+++ b/document/tasks/preprints.py
@@ -0,0 +1,82 @@
+import logging
+
+from django.db import DataError
+from django.utils.translation import gettext as _
+
+from core.collectors import preprints as preprints_collector
+from core.utils import date_utils
+from core.utils.request_utils import _get_user
+from document.services import preprints as preprint_service
+
+from config import celery_app
+
+from .common import _get_collection
+
+
+def load_preprints_from_preprints_api(
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ force_update=True,
+ user=None,
+):
+ from_date, until_date = date_utils.get_date_range_str(
+ from_date,
+ until_date,
+ days_to_go_back,
+ )
+ logging.info(
+ "Loading preprints into documents. From: %s, Until: %s",
+ from_date,
+ until_date,
+ )
+
+ collection_obj = _get_collection("preprints")
+ if not collection_obj:
+ logging.error("Collection not found: preprints")
+ return False
+
+ for record in preprints_collector.iter_records(from_date, until_date):
+ payload = preprints_collector.extract_record_data(record)
+
+ if not payload.get("pid_generic"):
+ logging.error("Preprint ID not found in record: %s", record)
+ continue
+
+ try:
+ preprint_service.upsert_preprint_document(
+ payload,
+ collection=collection_obj,
+ user=user,
+ force_update=force_update,
+ )
+ except DataError as exc:
+ logging.error(
+ "Error saving Preprint Document. Collection: %s, PID: %s. Error: %s",
+ collection_obj,
+                payload.get("pid_generic"),
+                exc,
+ )
+ continue
+
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (Preprints)"), timelimit=-1, queue="load")
+def task_load_preprints_into_documents(
+ self,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ force_update=True,
+ user_id=None,
+ username=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_preprints_from_preprints_api(
+ from_date=from_date,
+ until_date=until_date,
+ days_to_go_back=days_to_go_back,
+ force_update=force_update,
+ user=user,
+ )
diff --git a/document/tasks/scielo_books.py b/document/tasks/scielo_books.py
new file mode 100644
index 0000000..ddbd462
--- /dev/null
+++ b/document/tasks/scielo_books.py
@@ -0,0 +1,247 @@
+import logging
+
+from django.conf import settings
+from django.utils.translation import gettext as _
+
+from core.collectors import scielo_books as scielo_books_collector
+from core.utils.request_utils import _get_user
+from document.services import books as document_books_service
+from source.services import books as source_books_service
+
+from config import celery_app
+
+from .common import get_latest_scielo_books_last_seq
+
+
+def load_documents_from_scielo_books(
+ collection="books",
+ db_name=None,
+ since=0,
+ limit=None,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user=None,
+):
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ collection_obj = source_books_service.get_books_collection(collection)
+ monograph_cache = {}
+
+ logging.info(
+ "Loading documents from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s",
+ collection,
+ db_name,
+ since,
+ limit,
+ )
+
+ for item in scielo_books_collector.iter_change_documents(
+ base_url=base_url,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ headers=headers,
+ ):
+ change = item["change"]
+ raw_id = change.get("id")
+
+ if item["deleted"]:
+ delete_source = document_books_service.has_monograph_document_for_raw_id(
+ collection_obj,
+ raw_id,
+ )
+ document_books_service.delete_document_by_raw_id(collection_obj, raw_id)
+ if delete_source:
+ source_books_service.delete_book_source(collection_obj, raw_id)
+ continue
+
+ payload = item["payload"] or {}
+ source_url = item.get("source_url")
+ last_seq = change.get("seq")
+
+ if payload.get("TYPE") == "Monograph":
+ source = source_books_service.upsert_monograph_source(
+ payload,
+ collection=collection_obj,
+ user=user,
+ force_update=force_update,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+ document_books_service.upsert_monograph_document(
+ payload,
+ collection=collection_obj,
+ source=source,
+ user=user,
+ force_update=force_update,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+ monograph_cache[str(payload.get("id"))] = payload
+ continue
+
+ if payload.get("TYPE") != "Part":
+ continue
+
+ monograph_payload = _get_monograph_payload(
+ payload,
+ monograph_cache=monograph_cache,
+ base_url=base_url,
+ db_name=db_name,
+ headers=headers,
+ )
+ if not monograph_payload:
+ logging.warning(
+ "Skipping part %s because monograph %s could not be loaded.",
+ payload.get("id"),
+ payload.get("monograph"),
+ )
+ continue
+
+ source = source_books_service.upsert_monograph_source(
+ monograph_payload,
+ collection=collection_obj,
+ user=user,
+ force_update=force_update,
+ source_url=None,
+ last_seq=last_seq,
+ )
+ parent_document = document_books_service.upsert_monograph_document(
+ monograph_payload,
+ collection=collection_obj,
+ source=source,
+ user=user,
+ force_update=force_update,
+ source_url=None,
+ last_seq=last_seq,
+ )
+ enriched_payload = document_books_service.enrich_part_payload(
+ payload,
+ monograph_payload,
+ )
+ document_books_service.upsert_part_document(
+ enriched_payload,
+ collection=collection_obj,
+ source=source,
+ parent_document=parent_document,
+ user=user,
+ force_update=force_update,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+
+ return True
+
+
+def sync_documents_from_scielo_books(
+ collection="books",
+ db_name=None,
+ limit=None,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user=None,
+):
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ since = get_latest_scielo_books_last_seq(collection=collection)
+ logging.info(
+ "Syncing documents from SciELO Books incrementally. Collection: %s, Since: %s, Limit: %s",
+ collection,
+ since,
+ limit,
+ )
+ return load_documents_from_scielo_books(
+ collection=collection,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ force_update=force_update,
+ headers=headers,
+ base_url=base_url,
+ user=user,
+ )
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Manual)"), queue="load")
+def task_load_documents_from_scielo_books(
+ self,
+ collection="books",
+ db_name=None,
+ since=0,
+ limit=None,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user_id=None,
+ username=None,
+):
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_documents_from_scielo_books(
+ collection=collection,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ force_update=force_update,
+ headers=headers,
+ base_url=base_url,
+ user=user,
+ )
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Documents (SciELO Books - Incremental)"), queue="load")
+def task_sync_documents_from_scielo_books(
+ self,
+ collection="books",
+ db_name=None,
+ limit=None,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user_id=None,
+ username=None,
+):
+ db_name = db_name or settings.SCIELO_BOOKS_DB_NAME
+ limit = limit or settings.SCIELO_BOOKS_LIMIT
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return sync_documents_from_scielo_books(
+ collection=collection,
+ db_name=db_name,
+ limit=limit,
+ force_update=force_update,
+ headers=headers,
+ base_url=base_url,
+ user=user,
+ )
+
+
+def _get_monograph_payload(payload, monograph_cache, base_url=None, db_name=None, headers=None):
+ monograph_id = payload.get("monograph")
+ if not monograph_id:
+ return None
+
+ monograph_key = str(monograph_id)
+ if monograph_key in monograph_cache:
+ return monograph_cache[monograph_key]
+
+ try:
+ monograph_payload, _ = scielo_books_collector.fetch_document(
+ doc_id=monograph_id,
+ base_url=base_url,
+ db_name=db_name or settings.SCIELO_BOOKS_DB_NAME,
+ headers=headers,
+ )
+ except Exception as exc:
+ logging.warning(
+ "Failed to fetch monograph %s for part %s: %s",
+ monograph_id,
+ payload.get("id"),
+ exc,
+ )
+ return None
+
+ monograph_cache[monograph_key] = monograph_payload
+ return monograph_payload
diff --git a/document/tests.py b/document/tests.py
new file mode 100644
index 0000000..14d9bcd
--- /dev/null
+++ b/document/tests.py
@@ -0,0 +1,255 @@
+from unittest.mock import patch
+from django.test import TestCase
+
+from collection.models import Collection
+from document import tasks as document_tasks
+from source.services import books as source_books_service
+from source.models import Source
+
+from .models import Document
+from .services import articles as article_service
+from .services import books as books_service
+from .services import datasets as dataset_service
+from .services import preprints as preprint_service
+
+
+class DocumentMetadataTests(TestCase):
+ def test_metadata_includes_source_context_and_legacy_identifiers(self):
+ collection = Collection.objects.create(acron3="scl", acron2="sc")
+ source = Source.objects.create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ source_id="1234-5678",
+ scielo_issn="1234-5678",
+ title="Test Journal",
+ identifiers={"scielo_issn": "1234-5678"},
+ )
+ Document.objects.create(
+ collection=collection,
+ source=source,
+ document_type=Document.DOCUMENT_TYPE_ARTICLE,
+ document_id="S123456782024000100001",
+ scielo_issn="1234-5678",
+ pid_v2="S123456782024000100001",
+ pid_v3="abc123",
+ title="Test Article",
+ identifiers={"doi": "10.1590/example"},
+ files={"pt": {"path": "/pdf/test.pdf"}},
+ default_lang="en",
+ text_langs=["en", "pt"],
+ publication_date="2024-01-15",
+ publication_year="2024",
+ )
+
+ metadata = list(Document.metadata(collection=collection))
+
+ self.assertEqual(len(metadata), 1)
+ self.assertEqual(metadata[0]["document_type"], Document.DOCUMENT_TYPE_ARTICLE)
+ self.assertEqual(metadata[0]["document_id"], "S123456782024000100001")
+ self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL)
+ self.assertEqual(metadata[0]["source_id"], "1234-5678")
+ self.assertEqual(metadata[0]["scielo_issn"], "1234-5678")
+
+ def test_upsert_monograph_and_part_documents_from_books_payload(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+ monograph_payload = {
+ "TYPE": "Monograph",
+ "id": "abcd1",
+ "title": "Sample Book",
+ "isbn": "9788578791889",
+ "eisbn": "9788578791880",
+ "doi_number": "10.1234/book",
+ "language": "pt",
+ "publication_date": "2024-05-20",
+ "year": "2024",
+ "publisher": "SciELO Books",
+ }
+ part_payload = {
+ "TYPE": "Part",
+ "id": "18",
+ "monograph": "abcd1",
+ "title": "Chapter 18",
+ "text_language": "es",
+ "order": "18",
+ }
+
+ source = source_books_service.upsert_monograph_source(
+ monograph_payload,
+ collection=collection,
+ )
+ parent_document = books_service.upsert_monograph_document(
+ monograph_payload,
+ collection=collection,
+ source=source,
+ )
+ chapter = books_service.upsert_part_document(
+ books_service.enrich_part_payload(part_payload, monograph_payload),
+ collection=collection,
+ source=source,
+ parent_document=parent_document,
+ )
+
+ self.assertEqual(parent_document.document_type, Document.DOCUMENT_TYPE_BOOK)
+ self.assertEqual(parent_document.document_id, "book:abcd1")
+ self.assertEqual(parent_document.pid_generic, "book:abcd1")
+ self.assertEqual(chapter.document_type, Document.DOCUMENT_TYPE_CHAPTER)
+ self.assertEqual(chapter.document_id, "book:abcd1/chapter:18")
+ self.assertEqual(chapter.parent_document, parent_document)
+ self.assertEqual(chapter.identifiers["book_id"], "abcd1")
+ self.assertEqual(chapter.default_lang, "es")
+
+ def test_articlemeta_and_opac_upsert_same_document(self):
+ collection = Collection.objects.create(acron3="scl", acron2="sc")
+ source = Source.objects.create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ source_id="1234-5678",
+ scielo_issn="1234-5678",
+ acronym="testjou",
+ title="Test Journal",
+ identifiers={"scielo_issn": "1234-5678"},
+ )
+
+ first = article_service.upsert_article_document_from_articlemeta(
+ {
+ "code": "S123456782024000100001",
+ "title": "Article Title",
+ "pdfs": {"en": {"url": "/pdf/en.pdf"}},
+ "processing_date": "2024-02-10",
+ "publication_date": "2024-01-15",
+ "publication_year": "2024",
+ "default_language": "en",
+ "text_langs": ["en", "pt"],
+ "code_title": ["1234-5678"],
+ },
+ collection=collection,
+ source=source,
+ )
+ second = article_service.upsert_article_document_from_opac(
+ {
+ "pid_v2": "S123456782024000100001",
+ "pid_v3": "S1234-56782024000100001",
+ "title": "Article Title",
+ "journal_acronym": "testjou",
+ "publication_date": "2024-01-15",
+ "default_language": "en",
+ "text_langs": ["en", "pt"],
+ },
+ collection=collection,
+ source=source,
+ )
+
+ self.assertEqual(first.pk, second.pk)
+ self.assertEqual(Document.objects.count(), 1)
+ second.refresh_from_db()
+ self.assertEqual(second.pid_v3, "S1234-56782024000100001")
+ self.assertEqual(second.identifiers["journal_acronym"], "testjou")
+
+ def test_upsert_preprint_document_maps_metadata(self):
+ collection = Collection.objects.create(acron3="preprints", acron2="pp")
+
+ document = preprint_service.upsert_preprint_document(
+ {
+ "pid_generic": "preprint/123",
+ "title": "Preprint Title",
+ "text_langs": ["en", "pt"],
+ "default_language": "en",
+ "publication_date": "2024-01-20",
+ "publication_year": "2024",
+ },
+ collection=collection,
+ )
+
+ self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_PREPRINT)
+ self.assertEqual(document.document_id, "preprint/123")
+ self.assertEqual(document.pid_generic, "preprint/123")
+ self.assertEqual(document.default_lang, "en")
+
+ def test_upsert_dataset_document_accumulates_files(self):
+ collection = Collection.objects.create(acron3="data", acron2="dt")
+
+ dataset_service.upsert_dataset_document(
+ {
+ "title": "Dataset Title",
+ "dataset_doi": "10.1234/dataset",
+ "dataset_published": "2024-03-15",
+ "file_id": "1",
+ "file_name": "first.csv",
+ "file_url": "https://example.org/first.csv",
+ "file_persistent_id": "pid:first",
+ },
+ collection=collection,
+ )
+ document = dataset_service.upsert_dataset_document(
+ {
+ "title": "Dataset Title",
+ "dataset_doi": "10.1234/dataset",
+ "dataset_published": "2024-03-15",
+ "file_id": "2",
+ "file_name": "second.csv",
+ "file_url": "https://example.org/second.csv",
+ "file_persistent_id": "pid:second",
+ },
+ collection=collection,
+ )
+
+ self.assertEqual(document.document_type, Document.DOCUMENT_TYPE_DATASET)
+ self.assertEqual(document.document_id, "10.1234/dataset")
+ self.assertEqual(set(document.files.keys()), {"1", "2"})
+
+
+class DocumentBooksSyncTests(TestCase):
+ def test_get_latest_scielo_books_last_seq_uses_documents_and_sources(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+ source = Source.objects.create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ source_id="book-1",
+ title="Book 1",
+ extra_data={"last_seq": 120},
+ )
+ Document.objects.create(
+ collection=collection,
+ source=source,
+ document_type=Document.DOCUMENT_TYPE_BOOK,
+ document_id="book:book-1",
+ extra_data={"last_seq": "135"},
+ )
+
+ self.assertEqual(document_tasks.get_latest_scielo_books_last_seq("books"), 135)
+
+ def test_sync_documents_from_scielo_books_uses_computed_since(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+ source = Source.objects.create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ source_id="book-1",
+ title="Book 1",
+ extra_data={"last_seq": 120},
+ )
+ Document.objects.create(
+ collection=collection,
+ source=source,
+ document_type=Document.DOCUMENT_TYPE_BOOK,
+ document_id="book:book-1",
+ extra_data={"last_seq": 135},
+ )
+
+ with patch("document.tasks.scielo_books.load_documents_from_scielo_books", return_value=True) as mocked:
+ result = document_tasks.sync_documents_from_scielo_books(
+ collection="books",
+ db_name="scielobooks_1a",
+ limit=500,
+ )
+
+ self.assertTrue(result)
+ mocked.assert_called_once_with(
+ collection="books",
+ db_name="scielobooks_1a",
+ since=135,
+ limit=500,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user=None,
+ )
diff --git a/article/wagtail_hooks.py b/document/wagtail_hooks.py
similarity index 50%
rename from article/wagtail_hooks.py
rename to document/wagtail_hooks.py
index 4cf55bd..de291c9 100644
--- a/article/wagtail_hooks.py
+++ b/document/wagtail_hooks.py
@@ -1,39 +1,35 @@
from django.utils.translation import gettext_lazy as _
from wagtail.snippets.views.snippets import SnippetViewSet
-from wagtail.snippets.models import register_snippet
-from config.menu import get_menu_order
+from .models import Document
-from .models import Article
-
-class ArticleSnippetViewSet(SnippetViewSet):
- model = Article
+class DocumentSnippetViewSet(SnippetViewSet):
+ model = Document
icon = "folder-open-inverse"
- menu_name = "article"
- menu_label = _("Article")
- menu_order = get_menu_order("article")
- add_to_admin_menu = True
+ menu_label = _("Document")
+ menu_order = 300
list_display = (
"collection",
- "scielo_issn",
+ "document_type",
+ "document_id",
+ "source",
+ "title",
"pid_v2",
"pid_v3",
"pid_generic",
- "files",
"publication_year",
)
list_filter = (
"collection",
- "scielo_issn",
+ "document_type",
"publication_year",
)
search_fields = (
- "scielo_issn",
+ "document_id",
+ "title",
"pid_v2",
"pid_v3",
"pid_generic",
)
-
-register_snippet(ArticleSnippetViewSet)
diff --git a/journal/__init__.py b/journal/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/journal/admin.py b/journal/admin.py
deleted file mode 100644
index 8c38f3f..0000000
--- a/journal/admin.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.contrib import admin
-
-# Register your models here.
diff --git a/journal/migrations/0001_initial.py b/journal/migrations/0001_initial.py
deleted file mode 100644
index 7164bbc..0000000
--- a/journal/migrations/0001_initial.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Generated by Django 5.0.7 on 2025-02-07 17:50
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- initial = True
-
- dependencies = [
- ("collection", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="Journal",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "scielo_issn",
- models.CharField(max_length=9, verbose_name="SciELO ISSN"),
- ),
- (
- "issns",
- models.JSONField(
- blank=True, default=dict, null=True, verbose_name="ISSNs"
- ),
- ),
- (
- "acronym",
- models.CharField(
- blank=True,
- default="",
- max_length=32,
- null=True,
- verbose_name="Journal Acronym",
- ),
- ),
- (
- "title",
- models.CharField(max_length=255, verbose_name="Journal Title"),
- ),
- (
- "publisher_name",
- models.JSONField(
- blank=True,
- default=list,
- null=True,
- verbose_name="Publisher Name",
- ),
- ),
- (
- "subject_areas",
- models.JSONField(
- default=list, verbose_name="Subject Areas (CAPES)"
- ),
- ),
- (
- "wos_subject_areas",
- models.JSONField(default=list, verbose_name="Subject Areas (WoS)"),
- ),
- (
- "collection",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="collection.collection",
- verbose_name="Collection",
- ),
- ),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "verbose_name": "Journal",
- "verbose_name_plural": "Journals",
- "unique_together": {("collection", "scielo_issn", "acronym")},
- },
- ),
- ]
diff --git a/journal/migrations/0002_alter_journal_scielo_issn.py b/journal/migrations/0002_alter_journal_scielo_issn.py
deleted file mode 100644
index 07cf94f..0000000
--- a/journal/migrations/0002_alter_journal_scielo_issn.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-12 17:16
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("journal", "0001_initial"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="journal",
- name="scielo_issn",
- field=models.CharField(
- db_index=True, max_length=9, verbose_name="SciELO ISSN"
- ),
- ),
- ]
diff --git a/journal/migrations/__init__.py b/journal/migrations/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/journal/models.py b/journal/models.py
deleted file mode 100644
index 0d830e9..0000000
--- a/journal/models.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from django.db import models
-from django.utils.translation import gettext_lazy as _
-
-from core.models import CommonControlField
-from collection.models import Collection
-
-
-class Journal(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.CASCADE,
- blank=False,
- null=False,
- db_index=True,
- )
-
- scielo_issn = models.CharField(
- verbose_name=_('SciELO ISSN'),
- max_length=9,
- blank=False,
- null=False,
- db_index=True,
- )
-
- issns = models.JSONField(
- verbose_name=_('ISSNs'),
- null=True,
- blank=True,
- default=dict,
- )
-
- acronym = models.CharField(
- verbose_name=_('Journal Acronym'),
- max_length=32,
- blank=True,
- null=True,
- default='',
- )
-
- title = models.CharField(
- verbose_name=_('Journal Title'),
- max_length=255,
- blank=False,
- null=False,
- )
-
- publisher_name = models.JSONField(
- verbose_name=_('Publisher Name'),
- blank=True,
- null=True,
- default=list,
- )
-
- subject_areas = models.JSONField(
- verbose_name=_('Subject Areas (CAPES)'),
- null=False,
- blank=False,
- default=list,
- )
-
- wos_subject_areas = models.JSONField(
- verbose_name=_('Subject Areas (WoS)'),
- null=False,
- blank=False,
- default=list,
- )
-
- def __str__(self):
- return f'{self.collection.acron2} - {self.scielo_issn} - {self.acronym}'
-
- @classmethod
- def metadata(cls, collection=None):
- queryset = cls.objects.all()
- if collection:
- queryset = queryset.filter(collection=collection)
-
- for journal in queryset.only(
- 'acronym', 'collection__acron3', 'issns', 'publisher_name',
- 'scielo_issn', 'subject_areas', 'title', 'wos_subject_areas'
- ):
- yield {
- 'acronym': journal.acronym,
- 'collection': journal.collection.acron3,
- 'issns': set([v for v in journal.issns.values() if v]),
- 'publisher_name': journal.publisher_name,
- 'scielo_issn': journal.scielo_issn,
- 'subject_areas': journal.subject_areas,
- 'title': journal.title,
- 'wos_subject_areas': journal.wos_subject_areas,
- }
-
- class Meta:
- verbose_name = _('Journal')
- verbose_name_plural = _('Journals')
- unique_together = (
- 'collection',
- 'scielo_issn',
- 'acronym',
- )
diff --git a/journal/tasks.py b/journal/tasks.py
deleted file mode 100644
index 71681cb..0000000
--- a/journal/tasks.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import logging
-
-from django.contrib.auth import get_user_model
-from django.db import IntegrityError
-from django.utils import timezone
-from django.utils.translation import gettext as _
-
-from collection.models import Collection
-from config import celery_app
-from core.utils.utils import _get_user
-
-from . import models, utils
-
-
-User = get_user_model()
-
-
-@celery_app.task(bind=True, name=_('Load journal data from Article Meta'), queue='load')
-def task_load_journal_data_from_article_meta(self, collections=[], force_update=True, user_id=None, username=None, mode='thrift'):
- user = _get_user(user_id, username)
-
- for col in collections or Collection.acron3_list():
- for j in utils.fetch_article_meta_journals(collection=col, mode=mode):
- collection = Collection.objects.get(acron3=j.collection_acronym)
- if not collection:
- logging.error(f'Collection {j.collection_acronym} does not exist')
- continue
-
- try:
- journal, created = models.Journal.objects.get_or_create(collection=collection, scielo_issn=j.scielo_issn)
- except IntegrityError as e:
- logging.error(f'Journal {j} has not been created due to error: {e}')
- continue
-
- if created:
- journal.creator = user
- journal.created = timezone.now()
-
- if created or force_update:
- journal.updated_by = user
- journal.updated = timezone.now()
- journal.issns = {
- 'electronic_issn': j.electronic_issn or '',
- 'print_issn': j.print_issn or '',
- 'scielo_issn': j.scielo_issn
- }
- journal.acronym = j.acronym
- journal.title = j.title
- journal.publisher_name = j.publisher_name or ''
- journal.subject_areas = j.subject_areas or []
- journal.wos_subject_areas = j.wos_subject_areas or []
- logging.info(f'Journal {"created" if created else "updated"}: {journal}')
-
- journal.save()
-
- return True
diff --git a/journal/tests.py b/journal/tests.py
deleted file mode 100644
index 7ce503c..0000000
--- a/journal/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/journal/utils.py b/journal/utils.py
deleted file mode 100644
index 8a80521..0000000
--- a/journal/utils.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from articlemeta.client import ThriftClient, RestfulClient
-
-
-def fetch_article_meta_journals(collection='scl', mode='rest'):
- """
- Fetches article metadata from journals.
-
- Returns
- -------
- list
- A list of article metadata.
- """
- if mode == 'rest':
- am = RestfulClient()
- elif mode == 'thrift':
- am = ThriftClient()
-
- for j in am.journals(collection=collection):
- yield j
diff --git a/journal/views.py b/journal/views.py
deleted file mode 100644
index 91ea44a..0000000
--- a/journal/views.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.shortcuts import render
-
-# Create your views here.
diff --git a/journal/wagtail_hooks.py b/journal/wagtail_hooks.py
deleted file mode 100644
index 725b370..0000000
--- a/journal/wagtail_hooks.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from django.utils.translation import gettext_lazy as _
-from wagtail.snippets.views.snippets import SnippetViewSet
-from wagtail.snippets.models import register_snippet
-
-from config.menu import get_menu_order
-
-from .models import Journal
-
-
-class JournalSnippetViewSet(SnippetViewSet):
- model = Journal
- icon = "folder-open-inverse"
- menu_name = "journal"
- menu_label = _("Journal")
- menu_order = get_menu_order('journal')
- add_to_admin_menu = True
-
- list_display = (
- "collection",
- "scielo_issn",
- "acronym",
- "title",
- "issns",
- "publisher_name",
- "subject_areas",
- "wos_subject_areas",
- )
- list_filter = (
- "collection",
- )
- search_fields = (
- "issns",
- "acronym",
- "publisher_name",
- "subject_areas",
- "wos_subject_areas",
- )
-
-
-register_snippet(JournalSnippetViewSet)
diff --git a/local.yml b/local.yml
index 3c25357..9b3a047 100644
--- a/local.yml
+++ b/local.yml
@@ -11,10 +11,15 @@ services:
- mailhog
volumes:
- .:/app:z
- - ../scms_data/scielo_usage/data/logs:/app/logs
+ - /mnt/pidata2/pi/scl/logs:/app/logs
+ # Uncomment to use local SciELO lib repos for development:
+ # - ../scielo_log_validator:/app/scielo_log_validator:z
+ # - ../scielo_usage_counter:/app/scielo_usage_counter:z
env_file:
- ./.envs/.local/.django
- ./.envs/.local/.postgres
+ environment:
+ - USE_LOCAL_SCIELO_LIBS=0
ports:
- "8009:8000"
command: /start
@@ -40,7 +45,7 @@ services:
- "8029:8025"
redis:
- image: redis:6
+ image: redis:8
container_name: scielo_usage_local_redis
ports:
- "6399:6379"
diff --git a/log_manager/choices.py b/log_manager/choices.py
index e98c8f2..c6e461a 100644
--- a/log_manager/choices.py
+++ b/log_manager/choices.py
@@ -19,13 +19,3 @@
(LOG_FILE_STATUS_IGNORED, _("Ignored")),
]
-
-COLLECTION_LOG_FILE_DATE_COUNT_OK = 'OK'
-COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES = 'MIS'
-COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES = 'EXT'
-
-COLLECTION_LOG_FILE_DATE_COUNT = [
- (COLLECTION_LOG_FILE_DATE_COUNT_OK, _("OK")),
- (COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES, _("Missing Files")),
- (COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES, _("Extra files")),
-]
diff --git a/log_manager/management/__init__.py b/log_manager/management/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/log_manager/management/__init__.py
@@ -0,0 +1 @@
+
diff --git a/log_manager/management/commands/__init__.py b/log_manager/management/commands/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/log_manager/management/commands/__init__.py
@@ -0,0 +1 @@
+
diff --git a/log_manager/management/commands/reset_log_catalog.py b/log_manager/management/commands/reset_log_catalog.py
new file mode 100644
index 0000000..5ded576
--- /dev/null
+++ b/log_manager/management/commands/reset_log_catalog.py
@@ -0,0 +1,94 @@
+from django.core.management.base import BaseCommand
+from django.db import transaction
+
+from log_manager.models import LogFile
+from metrics.models import DailyMetricJob
+from metrics.services import daily_payloads
+from reports.models import MonthlyLogReport, WeeklyLogReport, YearlyLogReport
+from tracker.models import LogFileDiscardedLine
+
+
+class Command(BaseCommand):
+ help = (
+ "Clear the log catalog stored in the database, including derived parsing "
+ "records, daily metric payloads, and optionally reports, "
+ "while preserving the source log files on disk."
+ )
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "--collection",
+ action="append",
+ dest="collections",
+ help="Collection acronym to limit cleanup. Repeat the option for multiple collections.",
+ )
+ parser.add_argument(
+ "--reports",
+ action="store_true",
+ default=False,
+ help="Also clear Weekly/Monthly/Yearly log reports for the selected collections.",
+ )
+
+ def handle(self, *args, **options):
+ collections = options.get("collections") or []
+ clear_reports = options.get("reports")
+
+ log_files = LogFile.objects.all()
+ if collections:
+ log_files = log_files.filter(collection__acron3__in=collections)
+
+ log_file_ids = list(log_files.values_list("id", flat=True))
+ if not log_file_ids:
+ self.stdout.write(self.style.WARNING("No log catalog rows found for cleanup."))
+ return
+
+ daily_jobs = DailyMetricJob.objects.all()
+ if collections:
+ daily_jobs = daily_jobs.filter(collection__acron3__in=collections)
+ payload_paths = list(daily_jobs.exclude(storage_path="").values_list("storage_path", flat=True))
+
+ summary = {
+ "log_files": len(log_file_ids),
+ "discarded_lines": LogFileDiscardedLine.objects.filter(
+ log_file_id__in=log_file_ids
+ ).count(),
+ "daily_metric_jobs": daily_jobs.count(),
+ }
+
+ for storage_path in payload_paths:
+ daily_payloads.delete_payload(storage_path)
+
+ with transaction.atomic():
+ LogFileDiscardedLine.objects.filter(log_file_id__in=log_file_ids).delete()
+ daily_jobs.delete()
+ LogFile.objects.filter(id__in=log_file_ids).delete()
+
+ if clear_reports:
+            weekly_qs = WeeklyLogReport.objects.all()
+            monthly_qs = MonthlyLogReport.objects.all()
+            yearly_qs = YearlyLogReport.objects.all()
+            if collections:
+                weekly_qs = weekly_qs.filter(collection__acron3__in=collections)
+                monthly_qs = monthly_qs.filter(collection__acron3__in=collections)
+                yearly_qs = yearly_qs.filter(collection__acron3__in=collections)
+            summary["weekly_reports"] = weekly_qs.count()
+            summary["monthly_reports"] = monthly_qs.count()
+            summary["yearly_reports"] = yearly_qs.count()
+            weekly_qs.delete()
+            monthly_qs.delete()
+            yearly_qs.delete()
+
+ msg = (
+ f"Cleared log catalog: "
+ f"{summary['log_files']} log files, "
+ f"{summary['discarded_lines']} discarded lines, "
+ f"{summary['daily_metric_jobs']} daily metric jobs."
+ )
+ if clear_reports:
+ msg += (
+ f" Also cleared reports: "
+ f"{summary['weekly_reports']} weekly, "
+ f"{summary['monthly_reports']} monthly, "
+ f"{summary['yearly_reports']} yearly."
+ )
+ self.stdout.write(self.style.SUCCESS(msg))
diff --git a/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py
new file mode 100644
index 0000000..d30cdf4
--- /dev/null
+++ b/log_manager/migrations/0010_alter_logfiledate_unique_together_and_more.py
@@ -0,0 +1,52 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("log_manager", "0009_collectionlogfiledatecount_exported_files_count"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="logfiledate",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="logfiledate",
+ name="log_file",
+ ),
+ migrations.RemoveField(
+ model_name="logfiledate",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="logfile",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="logfile",
+ name="updated_by",
+ ),
+ migrations.AddField(
+ model_name="logfile",
+ name="date",
+ field=models.DateField(
+ blank=True, db_index=True, null=True, verbose_name="Date"
+ ),
+ ),
+ migrations.AddField(
+ model_name="logfile",
+ name="parse_heartbeat_at",
+ field=models.DateTimeField(
+ blank=True, null=True, verbose_name="Parse Heartbeat At"
+ ),
+ ),
+ migrations.DeleteModel(
+ name="CollectionLogFileDateCount",
+ ),
+ migrations.DeleteModel(
+ name="LogFileDate",
+ ),
+ ]
diff --git a/log_manager/models.py b/log_manager/models.py
index fc3a8b6..6bf04d8 100644
--- a/log_manager/models.py
+++ b/log_manager/models.py
@@ -1,209 +1,20 @@
import logging
-from django.db import models
-from django.db.models import Q
+from django.db import IntegrityError, models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from wagtail.admin.panels import FieldPanel
from wagtailautocomplete.edit_handlers import AutocompletePanel
from collection.models import Collection
-from core.forms import CoreAdminModelForm
-from core.models import CommonControlField
from . import choices
-class LogFileDate(CommonControlField):
- date = models.DateField(
- verbose_name=_("Date"),
- null=False,
- blank=False,
- db_index=True,
- )
-
- log_file = models.ForeignKey(
- 'LogFile',
- verbose_name=_('Log File'),
- blank=True,
- on_delete=models.DO_NOTHING,
- db_index=True,
- )
-
- base_form_class = CoreAdminModelForm
-
- panel = [
- FieldPanel('date'),
- AutocompletePanel('log_file')
- ]
-
- class Meta:
- ordering = ['-date']
- verbose_name = _("Log File Date")
- verbose_name_plural = _("Log File Dates")
- unique_together = (
- 'date',
- 'log_file',
- )
- indexes = [
- models.Index(fields=['date', 'log_file']),
- ]
-
- @classmethod
- def create_or_update(cls, user, log_file, date):
- obj, created = cls.objects.get_or_create(
- log_file=log_file,
- date=date,
- )
-
- if not created:
- obj.updated_by = user
- obj.updated = timezone.now()
- else:
- obj.creator = user
- obj.created = timezone.now()
-
- return obj
-
- @classmethod
- def filter_by_collection_and_date(cls, collection, date):
- return cls.objects.filter(
- ~Q(log_file__status__in=[
- choices.LOG_FILE_STATUS_CREATED,
- choices.LOG_FILE_STATUS_INVALIDATED
- ]),
- log_file__collection__acron3=collection,
- date=date,
- )
-
- @classmethod
- def get_number_of_found_files_for_date(cls, collection, date):
- return cls.objects.filter(
- ~Q(log_file__status__in=[
- choices.LOG_FILE_STATUS_CREATED,
- choices.LOG_FILE_STATUS_INVALIDATED
- ]),
- log_file__collection__acron3=collection,
- date=date,
- ).count()
-
- def __str__(self):
- return f'{self.log_file.path}-{self.date}'
-
-
-class CollectionLogFileDateCount(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
- null=False,
- blank=False,
- )
-
- date = models.DateField(
- _('Date'),
- null=False,
- blank=False,
- )
-
- year = models.IntegerField(
- _('Year'),
- null=False,
- blank=False,
- )
-
- month = models.IntegerField(
- _('Month'),
- null=False,
- blank=False,
- )
-
- found_log_files = models.IntegerField(
- verbose_name=_('Number of Found Valid Log Files'),
- default=0,
- )
-
- expected_log_files = models.IntegerField(
- verbose_name=_('Number of Expected Valid Log Files'),
- blank=True,
- null=True,
- )
-
- is_usage_metric_computed = models.BooleanField(
- verbose_name=_('Is Usage Metric Computed'),
- default=False,
- )
-
- exported_files_count = models.SmallIntegerField(
- verbose_name=_('Exported Files Count'),
- default=0,
- )
-
- status = models.CharField(
- verbose_name=_('Status'),
- choices=choices.COLLECTION_LOG_FILE_DATE_COUNT,
- max_length=3,
- )
-
- def set_status(self):
- if self.found_log_files < self.expected_log_files:
- self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES
- elif self.found_log_files > self.expected_log_files:
- self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES
- else:
- self.status = choices.COLLECTION_LOG_FILE_DATE_COUNT_OK
-
- def set_is_usage_metric_computed(self):
- if self.exported_files_count == self.found_log_files:
- self.is_usage_metric_computed = True
-
- @classmethod
- def create_or_update(cls, user, collection, date, expected_log_files, found_log_files):
- obj, created = cls.objects.get_or_create(
- collection=collection,
- date=date,
- month=date.month,
- year=date.year,
- )
-
- if not created:
- obj.updated_by = user
- obj.updated = timezone.now()
- else:
- obj.creator = user
- obj.created = timezone.now()
-
- obj.expected_log_files = expected_log_files
- obj.found_log_files = found_log_files
- obj.set_status()
-
- obj.save()
- return obj
-
- class Meta:
- ordering = ['-date']
- verbose_name = _("Collection Log File Date Count")
- unique_together = (
- 'collection',
- 'date',
- )
-
- panels = [
- AutocompletePanel('collection'),
- FieldPanel('date'),
- FieldPanel('year'),
- FieldPanel('month'),
- FieldPanel('found_log_files'),
- FieldPanel('expected_log_files'),
- FieldPanel('status'),
- FieldPanel('is_usage_metric_computed'),
- ]
-
- def __str__(self):
- return f'{self.collection.acron3}-{self.date}'
-
-
-class LogFile(CommonControlField):
+class LogFile(models.Model):
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
+ updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True)
+ date = models.DateField(verbose_name=_("Date"), null=True, blank=True, db_index=True)
hash = models.CharField(_("Hash MD5"), max_length=32, null=True, blank=True, unique=True)
path = models.CharField(_("Name"), max_length=255, null=False, blank=False)
@@ -246,19 +57,25 @@ class LogFile(CommonControlField):
default=0,
)
+ parse_heartbeat_at = models.DateTimeField(
+ _("Parse Heartbeat At"),
+ null=True,
+ blank=True,
+ )
+
panels = [
FieldPanel('hash'),
+ FieldPanel('date'),
FieldPanel('path'),
FieldPanel('stat_result'),
FieldPanel('status'),
FieldPanel('validation'),
FieldPanel('summary'),
FieldPanel('last_processed_line'),
+ FieldPanel('parse_heartbeat_at'),
AutocompletePanel('collection'),
]
- base_form_class = CoreAdminModelForm
-
class Meta:
verbose_name = _("Log File")
verbose_name_plural = _("Log Files")
@@ -268,25 +85,28 @@ def get(cls, hash):
return cls.objects.get(hash=hash)
@classmethod
- def create_or_update(cls, user, collection, path, stat_result, hash, status=None):
+ def create_or_update(cls, collection, path, stat_result, hash, status=None):
try:
+ obj, created = cls.objects.get_or_create(
+ hash=hash,
+ defaults={
+ "collection": collection,
+ "path": path,
+ "stat_result": stat_result,
+ "status": status or choices.LOG_FILE_STATUS_CREATED,
+ },
+ )
+ except IntegrityError:
obj = cls.get(hash=hash)
- obj.updated_by = user
+ created = False
+
+ if created:
+ logging.info(f'File {path} added to the database.')
+ else:
obj.updated = timezone.now()
+ obj.save(update_fields=["updated"])
logging.info(f'File {path} already exists in the database.')
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
- obj.collection = collection
- obj.path = path
- obj.stat_result = stat_result
- obj.hash = hash
- obj.status = status or choices.LOG_FILE_STATUS_CREATED
- logging.info(f'File {path} added to the database.')
-
- obj.save()
return obj
def __str__(self):
diff --git a/log_manager/tasks.py b/log_manager/tasks.py
index e14fe92..10148b1 100644
--- a/log_manager/tasks.py
+++ b/log_manager/tasks.py
@@ -1,20 +1,17 @@
import logging
-import json
import os
from django.conf import settings
-from django.core.mail import send_mail
from django.contrib.auth import get_user_model
from django.utils.translation import gettext as _
from core.utils import date_utils
-from core.utils.utils import _get_user
+from core.utils.request_utils import _get_user
from config import celery_app
from collection.models import Collection
-from log_manager_config import exceptions as lmc_exceptions, models as lmc_models
+from log_manager_config import models as lmc_models
from . import (
- exceptions,
choices,
models,
utils,
@@ -26,8 +23,8 @@
User = get_user_model()
-@celery_app.task(bind=True, name=_('Search for log files'), queue='load')
-def task_search_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None):
+@celery_app.task(bind=True, name=_('[Log Pipeline] 1. Search Logs (Manual)'), queue='load')
+def task_search_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None, trigger_validation=False):
"""
Task to search for log files in the directories defined in the CollectionLogDirectory model.
@@ -44,11 +41,11 @@ def task_search_log_files(self, collections=[], from_date=None, until_date=None,
for col in collections or Collection.acron3_list():
collection = Collection.objects.get(acron3=col)
- col_configs_dirs = lmc_models.CollectionLogDirectory.objects.filter(collection__acron3=col, active=True)
+ col_configs_dirs = lmc_models.CollectionLogDirectory.objects.filter(config__collection__acron3=col, active=True)
if len(col_configs_dirs) == 0:
logging.error(f'No CollectionLogDirectory found for collection {col}.')
- supported_logfile_extensions = lmc_models.SupportedLogFile.objects.values_list('file_extension', flat=True)
+ supported_logfile_extensions = settings.SUPPORTED_LOGFILE_EXTENSIONS
if len(supported_logfile_extensions) == 0:
logging.error('No SupportedLogFile found. Please, add a SupportedLogFile for each of the supported log file formats.')
@@ -62,7 +59,18 @@ def task_search_log_files(self, collections=[], from_date=None, until_date=None,
visible_dates = _get_visible_dates(from_date, until_date, days_to_go_back)
logging.debug(f'Visible dates: {visible_dates}')
- _add_log_file(user, collection, root, name, visible_dates)
+ _add_log_file(collection, root, name, visible_dates)
+
+ if trigger_validation:
+ task_validate_log_files.apply_async(kwargs={
+ "collections": collections,
+ "from_date": from_date,
+ "until_date": until_date,
+ "days_to_go_back": days_to_go_back,
+ "user_id": user_id,
+ "username": username,
+ "trigger_parse": True
+ })
def _get_visible_dates(from_date, until_date, days_to_go_back):
@@ -70,14 +78,13 @@ def _get_visible_dates(from_date, until_date, days_to_go_back):
return date_utils.get_date_objs_from_date_range(from_date_str, until_date_str)
-def _add_log_file(user, collection, root, name, visible_dates):
+def _add_log_file(collection, root, name, visible_dates):
file_path = os.path.join(root, name)
file_ctime = date_utils.get_date_obj_from_timestamp(os.stat(file_path).st_ctime)
logging.debug(f'Checking file {file_path} with ctime {file_ctime}.')
if file_ctime in visible_dates:
models.LogFile.create_or_update(
- user=user,
collection=collection,
path=file_path,
stat_result=os.stat(file_path),
@@ -85,8 +92,8 @@ def _add_log_file(user, collection, root, name, visible_dates):
)
-@celery_app.task(bind=True, name=_('Validate log files'), timelimit=-1, queue='load')
-def task_validate_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None, ignore_date=False):
+@celery_app.task(bind=True, name=_('[Log Pipeline] 2. Validate Logs (Manual)'), timelimit=-1, queue='load')
+def task_validate_log_files(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None, ignore_date=False, trigger_parse=False, revalidate=False, status_list=None):
"""
Task to validate log files in the database.
@@ -98,22 +105,58 @@ def task_validate_log_files(self, collections=[], from_date=None, until_date=Non
user_id (int, optional): The ID of the user initiating the task. Defaults to None.
username (str, optional): The username of the user initiating the task. Defaults to None.
ignore_date (bool, optional): If True, ignore the date of the log file. Defaults to False.
+ revalidate (bool, optional): If True, also revalidate files in statuses from status_list. Defaults to False.
+ status_list (list, optional): List of status codes to revalidate when revalidate=True. Defaults to [QUE, INV, ERR].
"""
cols = collections or Collection.acron3_list()
logging.info(f'Validating log files for collections: {cols}.')
visible_dates = _get_visible_dates(from_date, until_date, days_to_go_back)
if not ignore_date:
+ if not visible_dates:
+ logging.warning("No visible dates found for log validation.")
+ return
logging.info(f'Interval: {visible_dates[0]} to {visible_dates[-1]}.')
+ status_filter = [choices.LOG_FILE_STATUS_CREATED]
+ if revalidate:
+ status_filter += status_list or [choices.LOG_FILE_STATUS_QUEUED, choices.LOG_FILE_STATUS_INVALIDATED, choices.LOG_FILE_STATUS_ERROR]
+
+ tasks = []
for col in cols:
- for log_file in models.LogFile.objects.filter(status=choices.LOG_FILE_STATUS_CREATED, collection__acron3=col):
+ for log_file in models.LogFile.objects.filter(status__in=status_filter, collection__acron3=col):
file_ctime = date_utils.get_date_obj_from_timestamp(log_file.stat_result[LOGFILE_STAT_RESULT_CTIME_INDEX])
if file_ctime in visible_dates or ignore_date:
- task_validate_log_file.apply_async(args=(log_file.hash, user_id, username))
-
-
-@celery_app.task(bind=True, name=_('Validate log file'), timelimit=-1, queue='load')
+ tasks.append(task_validate_log_file.s(log_file.hash, user_id, username))
+
+ if tasks:
+ if trigger_parse:
+ from celery import chord
+ from metrics.tasks import task_parse_logs
+ chord(tasks)(task_parse_logs.si(
+ collections=collections,
+ from_date=from_date,
+ until_date=until_date,
+ days_to_go_back=days_to_go_back,
+ user_id=user_id,
+ username=username,
+ ))
+ else:
+ for task in tasks:
+ task.apply_async()
+ elif trigger_parse:
+ from metrics.tasks import task_parse_logs
+ task_parse_logs.apply_async(kwargs={
+ "collections": collections,
+ "from_date": from_date,
+ "until_date": until_date,
+ "days_to_go_back": days_to_go_back,
+ "user_id": user_id,
+ "username": username,
+ })
+
+
+@celery_app.task(bind=True, name=_('[Log Pipeline] Validate Single Log File (Auto)'), timelimit=-1, queue='load')
def task_validate_log_file(self, log_file_hash, user_id=None, username=None):
"""
Task to validate a specific log file.
@@ -135,21 +178,21 @@ def task_validate_log_file(self, log_file_hash, user_id=None, username=None):
del val_result['content']['summary']['datetimes']
if 'probably_date' in val_result:
- val_result['probably_date'] = date_utils.get_date_str(val_result['probably_date'])
-
- try:
- log_file.validation = val_result
- log_file.validation.update({'buffer_size': buffer_size, 'sample_size': sample_size})
- except json.JSONDecodeError as e:
- logging.error(f'Error serializing validation result: {e}')
- log_file.validation = {}
+ if isinstance(val_result['probably_date'], dict):
+ logging.error(f"Error determining probably_date: {val_result['probably_date'].get('error')}")
+ val_result['probably_date'] = None
+ else:
+ try:
+ val_result['probably_date'] = date_utils.get_date_str(val_result['probably_date'])
+ except (ValueError, AttributeError) as e:
+ logging.error(f'Error serializing probably_date: {e}')
+ val_result['probably_date'] = None
+
+ log_file.validation = val_result
+ log_file.validation.update({'buffer_size': buffer_size, 'sample_size': sample_size})
if val_result.get('is_valid', {}).get('all', False):
- models.LogFileDate.create_or_update(
- user=user,
- log_file=log_file,
- date=val_result.get('probably_date', ''),
- )
+ log_file.date = val_result.get('probably_date') or None
log_file.status = choices.LOG_FILE_STATUS_QUEUED
else:
@@ -160,116 +203,19 @@ def task_validate_log_file(self, log_file_hash, user_id=None, username=None):
def _fetch_validation_parameters(collection, default_buffer_size=0.1, default_sample_size=2048):
- col_configs_params = lmc_models.CollectionValidationParameters.objects.filter(collection__acron3=collection).first()
- if not col_configs_params:
- logging.warning(f'No CollectionValidationParameters found for collection {collection}. Using default values.')
+ col_configs = lmc_models.LogManagerCollectionConfig.objects.filter(collection__acron3=collection).first()
+ if not col_configs:
+ logging.warning(f'No LogManagerCollectionConfig found for collection {collection}. Using default values.')
return default_buffer_size, default_sample_size
- return col_configs_params.buffer_size, col_configs_params.sample_size
+ return col_configs.buffer_size, col_configs.sample_size
-@celery_app.task(bind=True, name=_('Check missing log files'))
-def task_check_missing_logs_for_date_range(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None):
+@celery_app.task(bind=True, name=_('[Log Pipeline] Daily Routine (Auto)'), queue='load')
+def task_daily_log_ingestion_pipeline(self):
"""
- Task to check for missing log files in the defined date range.
-
- Parameters:
- collections (list, optional): List of collection acronyms. Defaults to [].
- from_date (str, optional): The start date for log discovery in YYYY-MM-DD format. Defaults to None.
- until_date (str, optional): The end date for log discovery in YYYY-MM-DD format. Defaults to None.
- days_to_go_back (int, optional): The number of days to go back from today for log discovery. Defaults to None.
- user_id (int, optional): The ID of the user initiating the task. Defaults to None.
- username (str, optional): The username of the user initiating the task. Defaults to None.
-
- Raises:
- exceptions.UndefinedCollectionFilesPerDayError: Raised when there are no expected log files for the collection.
- exceptions.MultipleFilesPerDayForTheSameDateError: Raised when there are multiple expected log files for the same date.
+ Facade task for the daily log ingestion pipeline.
+ It initiates the Search -> Validate -> Parse chain using default parameters.
+ No arguments are required, making it easy to schedule periodically.
"""
- user = _get_user(self.request, username=username, user_id=user_id)
-
- from_date_str, until_date_str = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
-
- for col in collections or Collection.acron3_list():
- collection = Collection.objects.get(acron3=col)
- for date in date_utils.get_date_objs_from_date_range(from_date_str, until_date_str):
- logging.info(f'Couting logs for collection {col} and date {date}')
- count_logs_for_date(user, collection, date)
-
-
-def count_logs_for_date(user, collection, date):
- try:
- n_expected_files = lmc_models.CollectionLogFilesPerDay.get_number_of_expected_files_by_day(collection=collection.acron3, date=date)
- except lmc_exceptions.UndefinedCollectionFilesPerDayError:
- return
- except lmc_exceptions.MultipleFilesPerDayForTheSameDateError:
- return
-
- n_found_logs = models.LogFileDate.get_number_of_found_files_for_date(collection=collection.acron3, date=date)
-
- obj = models.CollectionLogFileDateCount.create_or_update(
- user=user,
- collection=collection,
- date=date,
- expected_log_files=n_expected_files,
- found_log_files=n_found_logs,
- )
- logging.info(f'Created CollectionLogFileDateCount object {obj}.')
-
-
-@celery_app.task(bind=True, name=_('Generate log files count report'))
-def task_log_files_count_status_report(self, collections=[], from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None):
- from_date, until_date = date_utils.get_date_range_str(from_date, until_date, days_to_go_back)
-
- from_date_obj = date_utils.get_date_obj(from_date)
- until_date_obj = date_utils.get_date_obj(until_date)
-
- for collection in collections or Collection.acron3_list():
- col = models.Collection.objects.get(acron3=collection)
- subject = _(f'Usage Log Validation Results ({from_date} to {until_date})')
- message = _(f'This message provides the results of the Usage Log Validation for the period {from_date} to {until_date}:\n\n')
-
- missing = models.CollectionLogFileDateCount.objects.filter(
- collection__acron3=collection,
- status=choices.COLLECTION_LOG_FILE_DATE_COUNT_MISSING_FILES,
- date__gte=from_date_obj,
- date__lte=until_date_obj,
- )
- extra = models.CollectionLogFileDateCount.objects.filter(
- collection__acron3=collection,
- status=choices.COLLECTION_LOG_FILE_DATE_COUNT_EXTRA_FILES,
- date__gte=from_date_obj,
- date__lte=until_date_obj,
- )
- ok = models.CollectionLogFileDateCount.objects.filter(
- collection__acron3=collection,
- status=choices.COLLECTION_LOG_FILE_DATE_COUNT_OK,
- date__gte=from_date_obj,
- date__lte=until_date_obj,
- )
-
- if missing.count() > 0:
- message += _(f'- There are {missing.count()} missing log files.\n')
- if extra.count() > 0:
- message += _(f'- There are {extra.count()} extra log files.\n')
- if ok.count() > 0:
- message += _(f'- There are {ok.count()} dates with correct log files.\n')
-
- if missing.count() > 0 or extra.count() > 0:
- message += _(f'\nPlease review the script responsible for sharing the log files.\n')
-
- message += _(f'\nYou can view the full report at {settings.WAGTAILADMIN_BASE_URL}/admin/snippets/log_manager/collectionlogfiledatecount/?collection={col.pk}>.')
-
- logging.info(f'Sending email to collection {col.main_name}. Subject: {subject}. Message: {message}')
- _send_message(subject, message, collection)
-
-
-def _send_message(subject, message, collection):
- collection_emails = lmc_models.CollectionEmail.objects.filter(collection__acron3=collection, active=True).values_list('email', flat=True)
- if len(collection_emails) == 0:
- raise exceptions.UndefinedCollectionConfigError(_("Error. Please, add an E-mail Configuration for the collection."))
-
- send_mail(
- subject=subject,
- message=message,
- from_email=settings.EMAIL_HOST_USER,
- recipient_list=collection_emails
- )
+ logging.info("Starting Daily Log Ingestion Pipeline")
+ task_search_log_files.apply_async(kwargs={"trigger_validation": True})
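
The facade above only enqueues `task_search_log_files` with `trigger_validation=True`; the rest of the Search -> Validate -> Parse chain is expected to fire downstream. A minimal sketch of kicking it off by hand (the task takes no arguments, so a Celery beat entry can point straight at its registered name):

```python
# Minimal sketch, assuming the usual Celery wiring: trigger the daily pipeline
# manually from a Django shell. Periodic scheduling would normally go through
# Celery beat pointing at the task's registered name instead.
from log_manager.tasks import task_daily_log_ingestion_pipeline

# .delay() enqueues the task on the 'load' queue declared in its decorator.
async_result = task_daily_log_ingestion_pipeline.delay()
print(async_result.id)  # Celery task id, visible in Flower
```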
diff --git a/log_manager/tests.py b/log_manager/tests.py
index 7ce503c..51c1402 100644
--- a/log_manager/tests.py
+++ b/log_manager/tests.py
@@ -1,3 +1,58 @@
+from unittest.mock import patch
+
+from django.db import IntegrityError
from django.test import TestCase
-# Create your tests here.
+from collection.models import Collection
+
+from . import choices, tasks
+from .models import LogFile
+
+
+class LogFileTests(TestCase):
+ def setUp(self):
+ self.collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ def test_create_or_update_creates_log_file(self):
+ log_file = LogFile.create_or_update(
+ collection=self.collection,
+ path="/tmp/new.log.gz",
+ stat_result={"size": 10},
+ hash="1" * 32,
+ )
+
+ self.assertEqual(log_file.collection, self.collection)
+ self.assertEqual(log_file.path, "/tmp/new.log.gz")
+ self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_CREATED)
+
+ def test_create_or_update_refetches_existing_log_after_integrity_error(self):
+ existing = LogFile.objects.create(
+ collection=self.collection,
+ path="/tmp/existing.log.gz",
+ stat_result={"size": 10},
+ hash="1" * 32,
+ status=choices.LOG_FILE_STATUS_CREATED,
+ )
+
+ with patch.object(LogFile.objects, "get_or_create", side_effect=IntegrityError):
+ log_file = LogFile.create_or_update(
+ collection=self.collection,
+ path="/tmp/existing.log.gz",
+ stat_result={"size": 10},
+ hash=existing.hash,
+ )
+
+ self.assertEqual(log_file.pk, existing.pk)
+
+
+class ValidateLogFilesTaskTests(TestCase):
+ def test_validate_log_files_returns_for_empty_visible_date_range(self):
+ with patch("log_manager.tasks.task_validate_log_file.s") as mocked_signature:
+ result = tasks.task_validate_log_files.run(
+ collections=["books"],
+ from_date="2024-02-02",
+ until_date="2024-02-01",
+ )
+
+ self.assertIsNone(result)
+ mocked_signature.assert_not_called()
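
The IntegrityError test above encodes the expected race-handling behaviour of `LogFile.create_or_update`. A hedged sketch of that pattern (the real method lives in `log_manager/models.py`; the lookup keys and `defaults` fields below are assumptions for illustration):

```python
# Sketch only: get_or_create, then refetch if a concurrent worker inserted the
# same hash first.
from django.db import IntegrityError

from log_manager.models import LogFile


def create_or_update_log_file(collection, path, stat_result, hash):
    """Race-safe create-or-fetch pattern the test above exercises."""
    try:
        obj, _created = LogFile.objects.get_or_create(
            collection=collection,
            hash=hash,
            defaults={"path": path, "stat_result": stat_result},
        )
    except IntegrityError:
        # Another worker won the race between the SELECT and the INSERT;
        # the row now exists, so fetch it instead of failing.
        obj = LogFile.objects.get(collection=collection, hash=hash)
    return obj
```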
diff --git a/log_manager/utils.py b/log_manager/utils.py
index 4a2b00b..c7dd2db 100644
--- a/log_manager/utils.py
+++ b/log_manager/utils.py
@@ -1,42 +1,47 @@
+import gzip
import hashlib
+from collections import deque
from scielo_log_validator import validator
-def hash_file(path, num_lines=25):
+def hash_file(path, num_lines=500):
"""
- Calculates the MD5 hash of a file using a combination of its first and last `num_lines` lines,
- as well as its size.
-
+ Calculates the MD5 hash of a file using a combination of its first and last
+ `num_lines` lines.
+
+ For gzip-compressed files, the content is decompressed before hashing,
+ so that different compressions of the same data produce the same hash.
+ File size is intentionally NOT included because it varies across
+ compressed copies and as a log file grows, which would make the same
+ log content hash differently and be ingested more than once.
+
Args:
path (str): The path to the file.
- num_lines (int): The number of lines to consider from the beginning and end of the file. Default is 25.
+ num_lines (int): The number of lines to consider from the beginning
+ and end of the file. Default is 500.
Returns:
The MD5 hash digest as a hexadecimal string.
"""
md5_hash = hashlib.md5()
- with open(path, 'rb') as file:
- # Read the first `num_lines` lines of the file
+ opener = gzip.open if _is_gzip(path) else open
+
+ with opener(path, 'rb') as file:
first_lines = b''.join([file.readline() for _ in range(num_lines)])
md5_hash.update(first_lines)
- # Move the file pointer to the end of the file
- file.seek(0, 2)
+ tail = deque(maxlen=num_lines)
+ for line in file:
+ tail.append(line)
+ md5_hash.update(b''.join(tail))
- # Get the size of the file
- size = file.tell()
- md5_hash.update(str(size).encode())
-
- # Move the file pointer to the start of the file
- file.seek(-size, 2)
+ return md5_hash.hexdigest()
- # Read the last `num_lines` lines of the file
- last_lines = file.readlines()[-num_lines:]
- md5_hash.update(b''.join(last_lines))
- return md5_hash.hexdigest()
+def _is_gzip(path):
+ with open(path, 'rb') as f:
+ return f.read(2) == b'\x1f\x8b'
def validate_file(path, sample_size=0.1, buffer_size=2048, days_delta=5, apply_path_validation=True, apply_content_validation=True):
return validator.pipeline_validate(
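
A small usage sketch for the new `hash_file`: because gzip content is decompressed before hashing and size is no longer mixed in, a plain log and a gzip copy of it should produce the same digest (paths and log content below are made up):

```python
import gzip
import shutil

from log_manager.utils import hash_file

# Write a toy log and a gzip copy of it (illustrative paths only).
with open("/tmp/access.log", "wb") as fh:
    fh.write(b'127.0.0.1 - - [01/Feb/2024:13:45:30 -0300] "GET / HTTP/1.1" 200 512\n' * 1000)

with open("/tmp/access.log", "rb") as src, gzip.open("/tmp/access.log.gz", "wb") as dst:
    shutil.copyfileobj(src, dst)

# Decompressed content is identical, so the digests match.
assert hash_file("/tmp/access.log") == hash_file("/tmp/access.log.gz")
```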
diff --git a/log_manager/wagtail_hooks.py b/log_manager/wagtail_hooks.py
index aeb6908..1548ad3 100644
--- a/log_manager/wagtail_hooks.py
+++ b/log_manager/wagtail_hooks.py
@@ -3,54 +3,10 @@
from wagtail.snippets.models import register_snippet
from config.menu import get_menu_order
+from log_manager_config.wagtail_hooks import LogManagerCollectionConfigSnippetViewSet
+from metrics.wagtail_hooks import DailyMetricJobSnippetViewSet
-from log_manager.models import (
- CollectionLogFileDateCount,
- LogFile,
- LogFileDate,
-)
-
-
-class LogFileDateViewSet(SnippetViewSet):
- model = LogFileDate
- menu_label = _("Log Files per Day")
- icon = "folder"
- menu_order = 300
-
- list_display = (
- "date",
- "log_file",
- )
- list_filter = (
- "date",
- "log_file__collection",
- )
- search_fields = ()
-
-
-class CollectionLogFileDateCountViewSet(SnippetViewSet):
- model = CollectionLogFileDateCount
- menu_label = _("Expected and Found Log Files")
- icon = "folder"
- menu_order = 400
-
- list_display = (
- "collection",
- "date",
- "found_log_files",
- "expected_log_files",
- "status",
- "exported_files_count",
- "is_usage_metric_computed",
- )
- list_filter = (
- "collection",
- "status",
- "exported_files_count",
- "is_usage_metric_computed",
- "year",
- "month"
- )
+from log_manager.models import LogFile
class LogFileSnippetViewSet(SnippetViewSet):
@@ -60,16 +16,17 @@ class LogFileSnippetViewSet(SnippetViewSet):
menu_order = 500
list_display = (
"path",
- "stat_result",
"collection",
"status",
+ "date",
"validation",
"summary",
"last_processed_line",
+ "parse_heartbeat_at",
"hash"
)
- list_filter = ("status", "collection")
- search_fields = ("file",)
+ list_filter = ("status", "collection", "date")
+ search_fields = ("path", "hash", "collection__acron3", "collection__main_name")
class LogSnippetViewSetGroup(SnippetViewSetGroup):
@@ -78,9 +35,9 @@ class LogSnippetViewSetGroup(SnippetViewSetGroup):
menu_icon = "folder-open-inverse"
menu_order = get_menu_order("log_manager")
items = (
- LogFileDateViewSet,
- CollectionLogFileDateCountViewSet,
+ LogManagerCollectionConfigSnippetViewSet,
LogFileSnippetViewSet,
+ DailyMetricJobSnippetViewSet,
)
diff --git a/log_manager_config/exceptions.py b/log_manager_config/exceptions.py
index ad7581a..0a6a6a9 100644
--- a/log_manager_config/exceptions.py
+++ b/log_manager_config/exceptions.py
@@ -4,11 +4,5 @@ class UndefinedCollectionLogDirectoryError(Exception):
class UndefinedCollectionEmailError(Exception):
...
-class UndefinedCollectionFilesPerDayError(Exception):
- ...
-
class UndefinedSupportedLogFile(Exception):
...
-
-class MultipleFilesPerDayForTheSameDateError(Exception):
- ...
diff --git a/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py
new file mode 100644
index 0000000..5b6351c
--- /dev/null
+++ b/log_manager_config/migrations/0004_logmanagercollectionconfig_and_more.py
@@ -0,0 +1,223 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:27
+
+import django.db.models.deletion
+import modelcluster.fields
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("collection", "0001_initial"),
+ ("log_manager_config", "0003_alter_collectionemail_options_and_more"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="LogManagerCollectionConfig",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(
+ auto_now_add=True, verbose_name="Creation date"
+ ),
+ ),
+ (
+ "updated",
+ models.DateTimeField(
+ auto_now=True, verbose_name="Last update date"
+ ),
+ ),
+ (
+ "sample_size",
+ models.FloatField(default=0.1, verbose_name="Sample Size"),
+ ),
+ (
+ "buffer_size",
+ models.IntegerField(default=2048, verbose_name="Buffer Size"),
+ ),
+ (
+ "expected_logs_per_day",
+ models.IntegerField(
+ default=1, verbose_name="Expected Logs Per Day"
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Log Manager Collection Config",
+ "verbose_name_plural": "Log Manager Collection Configs",
+ },
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogfilesperday",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogfilesperday",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogfilesperday",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="directory",
+ ),
+ migrations.RemoveField(
+ model_name="collectionurltranslatorclass",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="collectionvalidationparameters",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionvalidationparameters",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="collectionvalidationparameters",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="supportedlogfile",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="supportedlogfile",
+ name="updated_by",
+ ),
+ migrations.RemoveConstraint(
+ model_name="collectionemail",
+ name="unique_collection_email",
+ ),
+ migrations.RemoveConstraint(
+ model_name="collectionlogdirectory",
+ name="unique_collection_path",
+ ),
+ migrations.RemoveField(
+ model_name="collectionemail",
+ name="collection",
+ ),
+ migrations.RemoveField(
+ model_name="collectionlogdirectory",
+ name="collection",
+ ),
+ migrations.AddField(
+ model_name="collectionemail",
+ name="sort_order",
+ field=models.IntegerField(blank=True, editable=False, null=True),
+ ),
+ migrations.AddField(
+ model_name="collectionlogdirectory",
+ name="sort_order",
+ field=models.IntegerField(blank=True, editable=False, null=True),
+ ),
+ migrations.AddField(
+ model_name="collectionlogdirectory",
+ name="translator_class",
+ field=models.CharField(
+ default="URLTranslatorClassicSite", verbose_name="URL Translator Class"
+ ),
+ ),
+ migrations.AddField(
+ model_name="logmanagercollectionconfig",
+ name="collection",
+ field=models.OneToOneField(
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="log_manager_config",
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ migrations.AddField(
+ model_name="logmanagercollectionconfig",
+ name="creator",
+ field=models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ migrations.AddField(
+ model_name="logmanagercollectionconfig",
+ name="updated_by",
+ field=models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ migrations.AddField(
+ model_name="collectionemail",
+ name="config",
+ field=modelcluster.fields.ParentalKey(
+ blank=True,
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="emails",
+ to="log_manager_config.logmanagercollectionconfig",
+ ),
+ ),
+ migrations.AddField(
+ model_name="collectionlogdirectory",
+ name="config",
+ field=modelcluster.fields.ParentalKey(
+ blank=True,
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="directories",
+ to="log_manager_config.logmanagercollectionconfig",
+ ),
+ ),
+ migrations.AddConstraint(
+ model_name="collectionemail",
+ constraint=models.UniqueConstraint(
+ fields=("config", "email"), name="unique_config_email"
+ ),
+ ),
+ migrations.AddConstraint(
+ model_name="collectionlogdirectory",
+ constraint=models.UniqueConstraint(
+ fields=("config", "path"), name="unique_config_path"
+ ),
+ ),
+ migrations.DeleteModel(
+ name="CollectionLogFilesPerDay",
+ ),
+ migrations.DeleteModel(
+ name="CollectionURLTranslatorClass",
+ ),
+ migrations.DeleteModel(
+ name="CollectionValidationParameters",
+ ),
+ migrations.DeleteModel(
+ name="SupportedLogFile",
+ ),
+ ]
diff --git a/log_manager_config/models.py b/log_manager_config/models.py
index 384368e..8cf3e34 100644
--- a/log_manager_config/models.py
+++ b/log_manager_config/models.py
@@ -4,38 +4,57 @@
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
+from modelcluster.models import ClusterableModel
+from modelcluster.fields import ParentalKey
+from wagtail.models import Orderable
+from wagtail.admin.panels import FieldPanel, InlinePanel
+from wagtailautocomplete.edit_handlers import AutocompletePanel
+
from collection.models import Collection
from core.models import CommonControlField
-from .exceptions import MultipleFilesPerDayForTheSameDateError, UndefinedCollectionFilesPerDayError
-class CollectionLogDirectory(CommonControlField):
- collection = models.ForeignKey(
+class LogManagerCollectionConfig(ClusterableModel, CommonControlField):
+ collection = models.OneToOneField(
Collection,
verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
+ on_delete=models.CASCADE,
+ related_name="log_manager_config"
)
- path = models.CharField(
- verbose_name=_('Path'),
- max_length=255,
- blank=False,
+ sample_size = models.FloatField(
+ verbose_name=_('Sample Size'),
+ blank=False,
null=False,
+ default=0.1,
)
- directory_name = models.CharField(
- verbose_name=_('Directory Name'),
- max_length=255,
- blank=True,
- null=True,
+ buffer_size = models.IntegerField(
+ verbose_name=_('Buffer Size'),
+ blank=False,
+ null=False,
+ default=2048,
)
- active = models.BooleanField(
- verbose_name=_('Active'),
- default=True,
+ expected_logs_per_day = models.IntegerField(
+ verbose_name=_('Expected Logs Per Day'),
+ default=1,
)
+ panels = [
+ AutocompletePanel("collection"),
+ FieldPanel("sample_size"),
+ FieldPanel("buffer_size"),
+ FieldPanel("expected_logs_per_day"),
+ InlinePanel("directories", label=_("Directories")),
+ InlinePanel("emails", label=_("Emails")),
+ ]
+
def __str__(self):
- return f'{self.collection} - {self.path} - {self.directory_name}'
-
+ return f'{self.collection.acron3} Config'
+
+ class Meta:
+ verbose_name = _('Log Manager Collection Config')
+ verbose_name_plural = _('Log Manager Collection Configs')
+
@classmethod
def load(cls, data, user):
for item in data:
@@ -45,13 +64,12 @@ def load(cls, data, user):
logging.warning(f'Collection {item.get("acronym")} not found.')
continue
- logging.info(item)
cls.create_or_update(
user=user,
collection=collection,
- directory_name=item.get('directory_name'),
- path=item.get('path'),
- active=item.get('active', True),
+ sample_size=item.get('sample_size', 0.1),
+ buffer_size=item.get('buffer_size', 2048),
+ expected_logs_per_day=item.get('quantity', 1),
)
@classmethod
@@ -59,81 +77,66 @@ def create_or_update(
cls,
user,
collection,
- directory_name,
- path,
- active,
+ sample_size,
+ buffer_size,
+ expected_logs_per_day,
):
- try:
- obj = cls.objects.get(collection=collection, path=path)
- except cls.DoesNotExist:
- obj = cls()
+ obj, created = cls.objects.get_or_create(collection=collection)
+ if created:
obj.creator = user
obj.created = timezone.now()
- obj.collection = collection
obj.updated_by = user
obj.updated = timezone.now()
- obj.directory_name = directory_name
- obj.path = path
- obj.active = active
-
+ obj.sample_size = sample_size
+ obj.buffer_size = buffer_size
+ obj.expected_logs_per_day = expected_logs_per_day
obj.save()
- logging.info(f'{collection.acron3} - {directory_name} - {path}')
+ logging.info(f'Config for {collection.acron3} updated.')
return obj
- class Meta:
- verbose_name = _('Collection Log Directory')
- verbose_name_plural = _('Collection Log Directories')
- constraints = [
- models.UniqueConstraint(fields=['collection', 'path'], name='unique_collection_path')
- ]
-class CollectionLogFilesPerDay(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
+class CollectionLogDirectory(Orderable, CommonControlField):
+ config = ParentalKey(
+ 'LogManagerCollectionConfig',
+ related_name='directories',
+ on_delete=models.CASCADE,
+ null=True,
+ blank=True,
)
- start_date = models.DateField(
- verbose_name=_('Start Date'),
- blank=False,
+ path = models.CharField(
+ verbose_name=_('Path'),
+ max_length=255,
+ blank=False,
null=False,
)
- end_date = models.DateField(
- verbose_name=_('End Date'),
+ directory_name = models.CharField(
+ verbose_name=_('Directory Name'),
+ max_length=255,
blank=True,
null=True,
)
- quantity = models.IntegerField(
- verbose_name=_('Quantity'),
- default=1,
+ active = models.BooleanField(
+ verbose_name=_('Active'),
+ default=True,
+ )
+ translator_class = models.CharField(
+ verbose_name=_('URL Translator Class'),
+ blank=False,
+ null=False,
+ default='URLTranslatorClassicSite',
)
def __str__(self):
- return f'{self.start_date} - {self.quantity}'
+ return f'{self.config.collection} - {self.path} - {self.directory_name}'
- @classmethod
- def get_number_of_expected_files_by_day(cls, collection, date):
- files_by_day = cls.objects.filter(
- models.Q(collection__acron3=collection) &
- models.Q(start_date__lte=date) &
- (models.Q(end_date__gte=date) | models.Q(end_date__isnull=True))
- )
-
- if files_by_day.count() > 1:
- raise MultipleFilesPerDayForTheSameDateError(_("ERROR. Please, set the field end_date for the collection {collection}."))
-
- if files_by_day.count() == 0:
- raise UndefinedCollectionFilesPerDayError(_("ERROR. Please, set the number of files per day for the collection {collection}."))
-
- return int(files_by_day.get().quantity)
-
@classmethod
def load(cls, data, user):
for item in data:
try:
collection = Collection.objects.get(acron3=item.get('acronym'))
+ config, _created = LogManagerCollectionConfig.objects.get_or_create(collection=collection)
except Collection.DoesNotExist:
logging.warning(f'Collection {item.get("acronym")} not found.')
continue
@@ -141,52 +144,55 @@ def load(cls, data, user):
logging.info(item)
cls.create_or_update(
user=user,
- collection=collection,
- start_date=item.get('start_date'),
- quantity=item.get('quantity'),
- end_date=item.get('end_date'),
+ config=config,
+ directory_name=item.get('directory_name'),
+ path=item.get('path'),
+ active=item.get('active', True),
)
@classmethod
def create_or_update(
cls,
user,
- collection,
- start_date,
- quantity,
- end_date,
+ config,
+ directory_name,
+ path,
+ active,
):
try:
- obj = cls.objects.get(collection=collection, start_date=start_date)
+ obj = cls.objects.get(config=config, path=path)
except cls.DoesNotExist:
obj = cls()
obj.creator = user
obj.created = timezone.now()
- obj.collection = collection
-
+ obj.config = config
+
obj.updated_by = user
obj.updated = timezone.now()
- obj.start_date = start_date
- obj.quantity = quantity
- obj.end_date = end_date
-
+ obj.directory_name = directory_name
+ obj.path = path
+ obj.active = active
+
obj.save()
- logging.info(f'{collection.acron3} - {start_date} - {quantity}')
+ logging.info(f'{config.collection.acron3} - {directory_name} - {path}')
return obj
class Meta:
- verbose_name = _('Collection Log Files Per Day')
- verbose_name_plural = _('Collection Log Files Per Day')
+ verbose_name = _('Collection Log Directory')
+ verbose_name_plural = _('Collection Log Directories')
constraints = [
- models.UniqueConstraint(fields=['collection', 'start_date'], name='unique_collection_start_date')
+ models.UniqueConstraint(fields=['config', 'path'], name='unique_config_path')
]
-class CollectionEmail(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
+
+class CollectionEmail(Orderable, CommonControlField):
+ config = ParentalKey(
+ 'LogManagerCollectionConfig',
+ related_name='emails',
+ on_delete=models.CASCADE,
+ null=True,
+ blank=True,
)
name = models.CharField(
verbose_name=_('Name'),
@@ -218,6 +224,7 @@ def load(cls, data, user):
for item in data:
try:
collection = Collection.objects.get(acron3=item.get('acronym'))
+ config, _created = LogManagerCollectionConfig.objects.get_or_create(collection=collection)
except Collection.DoesNotExist:
logging.warning(f'Collection {item.get("acronym")} not found.')
continue
@@ -225,7 +232,7 @@ def load(cls, data, user):
logging.info(item)
cls.create_or_update(
user=user,
- collection=collection,
+ config=config,
email=item.get('e-mail'),
name=item.get('name'),
position=item.get('position'),
@@ -236,19 +243,19 @@ def load(cls, data, user):
def create_or_update(
cls,
user,
- collection,
+ config,
email,
name,
position,
active,
):
try:
- obj = cls.objects.get(collection=collection, email=email)
+ obj = cls.objects.get(config=config, email=email)
except cls.DoesNotExist:
obj = cls()
obj.creator = user
obj.created = timezone.now()
- obj.collection = collection
+ obj.config = config
obj.email = email
obj.updated_by = user
@@ -258,213 +265,14 @@ def create_or_update(
obj.active = active
obj.save()
- logging.info(f'{collection.acron3} - {name} - {position} - {email}')
+ logging.info(f'{config.collection.acron3} - {name} - {position} - {email}')
return obj
class Meta:
verbose_name = _('Collection Email')
verbose_name_plural = _('Collection Emails')
constraints = [
- models.UniqueConstraint(fields=['collection', 'email'], name='unique_collection_email')
- ]
-
-
-class CollectionValidationParameters(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
- primary_key=True,
- )
- sample_size = models.FloatField(
- verbose_name=_('Sample Size'),
- blank=False,
- null=False,
- default=0.1,
- )
- buffer_size = models.IntegerField(
- verbose_name=_('Buffer Size'),
- blank=False,
- null=False,
- default=2048,
- )
-
- def __str__(self):
- return f'{self.collection.acron3} - {self.sample_size} - {self.buffer_size}'
-
- @classmethod
- def load(cls, data, user):
- for item in data:
- try:
- collection = Collection.objects.get(acron3=item.get('acronym'))
- except Collection.DoesNotExist:
- logging.warning(f'Collection {item.get("acronym")} not found.')
- continue
-
- logging.info(item)
- cls.create_or_update(
- user=user,
- collection=collection,
- sample_size=item.get('sample_size'),
- buffer_size=item.get('buffer_size'),
- )
-
- @classmethod
- def create_or_update(
- cls,
- user,
- collection,
- sample_size,
- buffer_size,
- ):
- try:
- obj = cls.objects.get(collection=collection)
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
- obj.collection = collection
-
- obj.updated_by = user
- obj.updated = timezone.now()
- obj.sample_size = sample_size
- obj.buffer_size = buffer_size
-
- obj.save()
- logging.info(f'{collection.acron3} - {sample_size} - {buffer_size}')
- return obj
-
- class Meta:
- verbose_name = _('Collection Validation Parameters')
- verbose_name_plural = _('Collection Validation Parameters')
-
-
-class CollectionURLTranslatorClass(CommonControlField):
- collection = models.ForeignKey(
- Collection,
- verbose_name=_('Collection'),
- on_delete=models.DO_NOTHING,
- )
- directory = models.ForeignKey(
- CollectionLogDirectory,
- verbose_name=_('Directory'),
- on_delete=models.DO_NOTHING,
- )
- translator_class = models.CharField(
- verbose_name=_('URL Translator Class'),
- blank=False,
- null=False,
- default='URLTranslatorClassicSite',
- )
-
- def __str__(self):
- return f'{self.collection.acron3} - {self.directory} - {self.translator_class}'
-
- class Meta:
- verbose_name = _('Collection URL Translator Class')
- verbose_name_plural = _('Collection URL Translator Classes')
- constraints = [
- models.UniqueConstraint(fields=['collection', 'directory'], name='unique_collection_directory')
+ models.UniqueConstraint(fields=['config', 'email'], name='unique_config_email')
]
- @classmethod
- def load(cls, data, user):
- for item in data:
- try:
- collection = Collection.objects.get(acron3=item.get('acronym'))
- except Collection.DoesNotExist:
- logging.warning(f'Collection {item.get("acronym")} not found.')
- continue
-
- try:
- directory = CollectionLogDirectory.objects.get(collection=collection, path=item.get('path'))
- logging.info(item)
- cls.create_or_update(
- user=user,
- collection=collection,
- directory=directory,
- translator_class=item.get('translator_class'),
- )
- except CollectionLogDirectory.DoesNotExist:
- logging.warning(f'Directory {item.get("path")} not found.')
- continue
- @classmethod
- def create_or_update(
- cls,
- user,
- collection,
- directory,
- translator_class,
- ):
- try:
- obj = cls.objects.get(collection=collection)
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
- obj.collection = collection
- obj.directory = directory
-
- obj.updated_by = user
- obj.updated = timezone.now()
- obj.translator_class = translator_class
-
- obj.save()
- logging.info(f'{collection.acron3} - {directory.path} - {translator_class}')
- return obj
-
-
-class SupportedLogFile(CommonControlField):
- file_extension = models.CharField(
- verbose_name=_('File Extension'),
- max_length=255,
- unique=True,
- blank=False,
- null=False,
- )
- description = models.TextField(
- verbose_name=_('Description'),
- blank=True,
- null=True,
- )
-
- def __str__(self):
- return f'{self.file_extension}'
-
- @classmethod
- def load(cls, data, user):
- for item in data:
- logging.info(item)
- cls.create_or_update(
- user=user,
- file_extension=item.get('file_extension'),
- description=item.get('description'),
- )
-
- @classmethod
- def create_or_update(
- cls,
- user,
- file_extension,
- description,
- ):
- try:
- obj = cls.objects.get(file_extension=file_extension)
- except cls.DoesNotExist:
- obj = cls()
- obj.creator = user
- obj.created = timezone.now()
-
- obj.updated_by = user
- obj.updated = timezone.now()
- obj.file_extension = file_extension
- obj.description = description
-
- obj.save()
- logging.info(f'{file_extension}')
- return obj
-
- class Meta:
- verbose_name = _('Supported Log File')
- verbose_name_plural = _('Supported Log Files')
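
Besides the inline panels in the Wagtail admin, the new parent/child layout can be populated programmatically through the `create_or_update` helpers shown above. A sketch under the assumption that a user object (`admin_user`) and the `scl` collection already exist; values are illustrative:

```python
from collection.models import Collection
from log_manager_config.models import (
    CollectionEmail,
    CollectionLogDirectory,
    LogManagerCollectionConfig,
)

collection = Collection.objects.get(acron3="scl")
config = LogManagerCollectionConfig.create_or_update(
    user=admin_user,  # placeholder: any existing user
    collection=collection,
    sample_size=0.1,
    buffer_size=2048,
    expected_logs_per_day=2,
)
CollectionLogDirectory.create_or_update(
    user=admin_user,
    config=config,
    directory_name="Site novo",
    path="/app/logs/bkp-ratchet/scielo.nbr",
    active=True,
)
CollectionEmail.create_or_update(
    user=admin_user,
    config=config,
    email="tecnologia@scielo.org",
    name="Equipe de tecnologia",
    position="TI",
    active=True,
)
```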
diff --git a/log_manager_config/tasks.py b/log_manager_config/tasks.py
index f15262b..c4ff399 100644
--- a/log_manager_config/tasks.py
+++ b/log_manager_config/tasks.py
@@ -1,59 +1,25 @@
-from django.contrib.auth import get_user_model
+from django.conf import settings
from django.utils.translation import gettext as _
-from core.utils.utils import _get_user
from config import celery_app
+from config.collections import COLLECTION_SIZE_SAMPLE_MAP, LOG_MANAGER_SEED_DATA
+from core.utils.request_utils import _get_user
from . import models
-User = get_user_model()
-
-
-@celery_app.task(bind=True, name=_('Load log manager collection settings'))
-def task_load_log_manager_collection_settings(self, data={}, user_id=None, username=None):
+@celery_app.task(bind=True, name=_('[Log Pipeline] Load Log Manager Settings (Seed)'))
+def task_load_log_manager_collection_settings(self, data=None, user_id=None, username=None):
user = _get_user(self.request, username=username, user_id=user_id)
if not data:
- data = [
- {'acronym': 'arg', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ar', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'bol', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.bo', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'chl', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.cl', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'col', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.co', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'cri', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.cr', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'cub', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.cu', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'data', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-dataverse', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'dataverse'},
- {'acronym': 'ecu', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ec', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'esp', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.es', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'mex', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.mx', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'per', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.pe', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'preprints', 'directory_name': _('Site clássico') , 'path': '/app/logs/submission-node01', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'preprints'},
- {'acronym': 'prt', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.pt', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'pry', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.py', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'psi', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.pepsic', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'rve', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.revenf', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'rvt', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.revtur', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'scl', 'directory_name': _('Site novo') , 'path': '/app/logs/bkp-ratchet/scielo.nbr', 'quantity': 2, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'opac'},
- {'acronym': 'spa', 'directory_name': _('Site novo - versão prévia') , 'path': '/app/logs/bkp-ratchet/scielo.sp', 'quantity': 2, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'opac_alpha'},
- {'acronym': 'sss', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ss', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'sza', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.za', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'ury', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.uy', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'ven', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.ve', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- {'acronym': 'wid', 'directory_name': _('Site clássico') , 'path': '/app/logs/bkp-ratchet/scielo.wi', 'quantity': 1, 'start_date': '2020-01-01', 'e-mail': 'tecnologia@scielo.org', 'translator_class': 'classic'},
- ]
+ data = LOG_MANAGER_SEED_DATA
for i in data:
- i['sample_size'] = 0.1 if i['acronym'] not in ['data', 'preprints', 'wid'] else 1.0
+ size = getattr(settings, 'COLLECTION_ACRON3_SIZE_MAP', {}).get(i['acronym'], 'small')
+ i['sample_size'] = COLLECTION_SIZE_SAMPLE_MAP.get(size, 1.0)
i['buffer_size'] = 2048
- data_extensions = [
- {'file_extension': '.log', 'description': ''},
- {'file_extension': '.gz', 'description': ''}
- ]
-
+ models.LogManagerCollectionConfig.load(data, user)
models.CollectionLogDirectory.load(data, user)
models.CollectionEmail.load(data, user)
- models.CollectionLogFilesPerDay.load(data, user)
- models.CollectionValidationParameters.load(data, user)
- models.CollectionURLTranslatorClass.load(data, user)
- models.SupportedLogFile.load(data_extensions, user)
\ No newline at end of file
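
The seed task now reads its data from `config/collections.py`, which is not part of this diff. A hedged sketch of the shapes it appears to expect, inferred from the old inline seed and from the lookups above; all keys beyond `acronym`, `sample_size`, and `buffer_size`, and the map values, are assumptions:

```python
# Assumed shapes only; the real constants live in config/collections.py.
LOG_MANAGER_SEED_DATA = [
    {
        "acronym": "scl",
        "directory_name": "Site novo",
        "path": "/app/logs/bkp-ratchet/scielo.nbr",
        "quantity": 2,
        "e-mail": "tecnologia@scielo.org",
        "translator_class": "opac",
    },
    # ... one entry per collection
]

# settings.COLLECTION_ACRON3_SIZE_MAP maps acron3 -> size label; this map then
# turns the label into the validation sample fraction (values illustrative).
COLLECTION_SIZE_SAMPLE_MAP = {
    "small": 1.0,
    "medium": 0.5,
    "large": 0.1,
}
```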
diff --git a/log_manager_config/wagtail_hooks.py b/log_manager_config/wagtail_hooks.py
index 2ecf908..f91c0b1 100644
--- a/log_manager_config/wagtail_hooks.py
+++ b/log_manager_config/wagtail_hooks.py
@@ -1,134 +1,24 @@
from django.utils.translation import gettext_lazy as _
-from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup
-from wagtail.snippets.models import register_snippet
+from wagtail.snippets.views.snippets import SnippetViewSet
-from config.menu import get_menu_order
+from log_manager_config.models import LogManagerCollectionConfig
-from log_manager_config.models import (
- CollectionLogDirectory,
- CollectionLogFilesPerDay,
- CollectionEmail,
- CollectionValidationParameters,
- CollectionURLTranslatorClass,
- SupportedLogFile,
-)
-
-
-class CollectionLogDirectorySnippetViewSet(SnippetViewSet):
- model = CollectionLogDirectory
- menu_label = _("Collection Log Directory")
- icon = "folder"
+class LogManagerCollectionConfigSnippetViewSet(SnippetViewSet):
+ model = LogManagerCollectionConfig
+ menu_label = _("Log Manager Configurations")
+ icon = "cogs"
menu_order = 300
- list_display = (
- "collection",
- "directory_name",
- "path",
- "active",
- )
- list_filter = (
- "collection",
- "active",
- )
- search_fields = (
- "path",
- )
-
-
-class CollectionLogFilesPerDaySnippetViewSet(SnippetViewSet):
- model = CollectionLogFilesPerDay
- menu_label = _("Collection Log Files Per Day")
- icon = "folder"
- menu_order = 400
-
- list_display = (
- "collection",
- "start_date",
- "end_date",
- "quantity",
- )
- list_filter = (
- "collection",
- )
-
-
-class CollectionEmailSnippetViewSet(SnippetViewSet):
- model = CollectionEmail
- menu_label = _("Collection Email")
- icon = "folder"
- menu_order = 500
-
- list_display = (
- "collection",
- "name",
- "position",
- "email",
- "active",
- )
- list_filter = (
- "collection",
- "active",
- )
- search_fields = (
- "name",
- "email"
- )
-
-class CollectionValidationParametersSnippetViewSet(SnippetViewSet):
- model = CollectionValidationParameters
- menu_label = _("Collection Validation Parameters")
- icon = "folder"
- menu_order = 550
-
list_display = (
"collection",
"sample_size",
"buffer_size",
+ "expected_logs_per_day",
+ "updated",
)
list_filter = (
"collection",
)
-
-class CollectionURLTranslatorClassSnippetViewSet(SnippetViewSet):
- model = CollectionURLTranslatorClass
- menu_label = _("Collection URL Translator Class")
- icon = "folder"
- menu_order = 600
-
- list_display = (
- "collection",
- "directory",
- "translator_class",
- )
- list_filter = (
- "collection",
- )
-
-class SupportedLogFileSnippetViewSet(SnippetViewSet):
- model = SupportedLogFile
- menu_label = _("Supported Log File Formats")
- icon = "folder"
- menu_order = 600
-
- list_display = (
- "file_extension",
- "description",
- )
-
-
-class LogManagerConfigSnippetViewSetGroup(SnippetViewSetGroup):
- menu_name = 'log_manager_config'
- menu_label = _("Log Manager Config")
- menu_icon = "folder-open-inverse"
- menu_order = get_menu_order("log_manager_config")
- items = (
- CollectionLogDirectorySnippetViewSet,
- CollectionLogFilesPerDaySnippetViewSet,
- CollectionEmailSnippetViewSet,
- CollectionValidationParametersSnippetViewSet,
- CollectionURLTranslatorClassSnippetViewSet,
- SupportedLogFileSnippetViewSet,
+ search_fields = (
+ "collection__acron3",
)
-
-
-register_snippet(LogManagerConfigSnippetViewSetGroup)
diff --git a/merge_production_dotenvs_in_dotenv.py b/merge_production_dotenvs_in_dotenv.py
deleted file mode 100644
index d1170ef..0000000
--- a/merge_production_dotenvs_in_dotenv.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import os
-from pathlib import Path
-from typing import Sequence
-
-import pytest
-
-ROOT_DIR_PATH = Path(__file__).parent.resolve()
-PRODUCTION_DOTENVS_DIR_PATH = ROOT_DIR_PATH / ".envs" / ".production"
-PRODUCTION_DOTENV_FILE_PATHS = [
- PRODUCTION_DOTENVS_DIR_PATH / ".django",
- PRODUCTION_DOTENVS_DIR_PATH / ".postgres",
-]
-DOTENV_FILE_PATH = ROOT_DIR_PATH / ".env"
-
-
-def merge(
- output_file_path: str, merged_file_paths: Sequence[str], append_linesep: bool = True
-) -> None:
- with open(output_file_path, "w") as output_file:
- for merged_file_path in merged_file_paths:
- with open(merged_file_path, "r") as merged_file:
- merged_file_content = merged_file.read()
- output_file.write(merged_file_content)
- if append_linesep:
- output_file.write(os.linesep)
-
-
-def main():
- merge(DOTENV_FILE_PATH, PRODUCTION_DOTENV_FILE_PATHS)
-
-
-@pytest.mark.parametrize("merged_file_count", range(3))
-@pytest.mark.parametrize("append_linesep", [True, False])
-def test_merge(tmpdir_factory, merged_file_count: int, append_linesep: bool):
- tmp_dir_path = Path(str(tmpdir_factory.getbasetemp()))
-
- output_file_path = tmp_dir_path / ".env"
-
- expected_output_file_content = ""
- merged_file_paths = []
- for i in range(merged_file_count):
- merged_file_ord = i + 1
-
- merged_filename = ".service{}".format(merged_file_ord)
- merged_file_path = tmp_dir_path / merged_filename
-
- merged_file_content = merged_filename * merged_file_ord
-
- with open(merged_file_path, "w+") as file:
- file.write(merged_file_content)
-
- expected_output_file_content += merged_file_content
- if append_linesep:
- expected_output_file_content += os.linesep
-
- merged_file_paths.append(merged_file_path)
-
- merge(output_file_path, merged_file_paths, append_linesep)
-
- with open(output_file_path, "r") as output_file:
- actual_output_file_content = output_file.read()
-
- assert actual_output_file_content == expected_output_file_content
-
-
-if __name__ == "__main__":
- main()
diff --git a/metrics/counter/__init__.py b/metrics/counter/__init__.py
new file mode 100644
index 0000000..c9afd92
--- /dev/null
+++ b/metrics/counter/__init__.py
@@ -0,0 +1,22 @@
+from .access import (
+ extract_item_access_data,
+ is_valid_item_access_data,
+ update_results_with_item_access_data,
+)
+from .documents import convert_raw_results_to_index_documents
+from .identifiers import (
+ generate_item_access_id,
+ generate_month_document_id,
+ generate_user_session_id,
+ generate_year_document_id,
+)
+from .parser import (
+ extract_date_from_validation_dict,
+ translator_class_name_to_obj,
+)
+from metrics.opensearch.names import (
+ extract_access_month,
+ extract_access_year,
+ generate_month_index_name,
+ generate_year_index_name,
+)
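
The package root re-exports the per-concern helpers (access extraction, document conversion, identifiers, parsing) together with the OpenSearch index-name utilities, so downstream code can import from `metrics.counter` directly:

```python
# Example import path for callers (names come from the re-exports above).
from metrics.counter import (
    extract_item_access_data,
    generate_month_index_name,
    is_valid_item_access_data,
    update_results_with_item_access_data,
)
```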
diff --git a/metrics/counter/access.py b/metrics/counter/access.py
new file mode 100644
index 0000000..12c0cc5
--- /dev/null
+++ b/metrics/counter/access.py
@@ -0,0 +1,438 @@
+import re
+from urllib.parse import unquote, urlparse
+
+from scielo_usage_counter.values import (
+ CONTENT_TYPE_UNDEFINED,
+ DEFAULT_SCIELO_ISSN,
+ MEDIA_LANGUAGE_UNDEFINED,
+ MEDIA_FORMAT_UNDEFINED,
+)
+
+from core.utils.standardizer import (
+ standardize_language_code,
+ standardize_pid_generic,
+ standardize_pid_v2,
+ standardize_pid_v3,
+ standardize_year_of_publication,
+)
+from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour
+from metrics.counter.identifiers import generate_item_access_id, generate_user_session_id
+
+
+def extract_item_access_data(collection_acron3: str, translated_url: dict):
+ if not translated_url or not isinstance(translated_url, dict):
+ return {}
+
+ source_type = _extract_source_type(collection_acron3, translated_url)
+ source_id = _extract_source_id(collection_acron3, translated_url, source_type)
+ scielo_issn = _extract_scielo_issn(translated_url, source_type, source_id)
+ document_type = _extract_document_type(collection_acron3, translated_url, source_type)
+ publication_year = _safe_standardize(
+ standardize_year_of_publication,
+ translated_url.get("year_of_publication"),
+ )
+ source_access_type = translated_url.get("source_access_type")
+
+ return {
+ "collection": collection_acron3,
+ "source_type": source_type,
+ "source_id": source_id,
+ "scielo_issn": scielo_issn,
+ "document_type": document_type,
+ "pid_v2": _safe_standardize(standardize_pid_v2, translated_url.get("pid_v2")),
+ "pid_v3": _safe_standardize(standardize_pid_v3, translated_url.get("pid_v3")),
+ "pid_generic": _safe_standardize(
+ standardize_pid_generic,
+ translated_url.get("pid_generic"),
+ ),
+ "title_pid_generic": _safe_standardize(
+ standardize_pid_generic,
+ translated_url.get("title_pid_generic"),
+ ),
+ "segment_pid_generics": _standardize_pid_generic_list(
+ translated_url.get("segment_pid_generics"),
+ ),
+ "media_language": _safe_standardize(
+ standardize_language_code,
+ translated_url.get("media_language"),
+ default="un",
+ ),
+ "media_format": translated_url.get("media_format"),
+ "content_type": translated_url.get("content_type"),
+ "access_url": translated_url.get("access_url") or translated_url.get("normalized_url"),
+ "publication_year": publication_year,
+ "counter_access_type": _counter_access_type(source_access_type),
+ "access_method": "Regular",
+ "source_main_title": _extract_source_title(translated_url),
+ "source_subject_area_capes": translated_url.get("source_subject_area_capes")
+ or translated_url.get("journal_subject_area_capes"),
+ "source_subject_area_wos": translated_url.get("source_subject_area_wos")
+ or translated_url.get("journal_subject_area_wos"),
+ "source_acronym": translated_url.get("source_acronym")
+ or translated_url.get("journal_acronym"),
+ "source_publisher_name": translated_url.get("source_publisher_name")
+ or translated_url.get("journal_publisher_name"),
+ "source_access_type": source_access_type,
+ "source_identifiers": _extract_source_identifiers(translated_url, source_id, source_type),
+ "source_city": translated_url.get("source_city"),
+ "source_country": translated_url.get("source_country"),
+ }
+
+
+def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False):
+ if not isinstance(data, dict):
+ return False, {"message": "Invalid data format. Expected a dictionary.", "code": "invalid_format"}
+
+ scielo_issn = data.get("scielo_issn")
+ source_id = data.get("source_id")
+ source_type = data.get("source_type")
+ document_type = data.get("document_type") or "article"
+ media_format = data.get("media_format")
+ media_language = data.get("media_language")
+ content_type = data.get("content_type")
+ pid_v2 = data.get("pid_v2")
+ pid_v3 = data.get("pid_v3")
+ pid_generic = data.get("pid_generic")
+ has_source_identity = bool(source_id) or bool(
+ scielo_issn and scielo_issn != DEFAULT_SCIELO_ISSN
+ )
+ has_media_language = bool(media_language and media_language != MEDIA_LANGUAGE_UNDEFINED)
+ has_pid = bool(pid_v2 or pid_v3 or pid_generic)
+
+ if not all([media_format and media_format != MEDIA_FORMAT_UNDEFINED, content_type and content_type != CONTENT_TYPE_UNDEFINED, has_pid]):
+ return False, {"message": "Missing required fields in item access data.", "code": "missing_fields"}
+
+ if document_type in {"article", "book", "chapter"} and not has_media_language:
+ return False, {"message": "Missing media language in item access data.", "code": "missing_fields"}
+
+ if document_type == "article" and not has_source_identity:
+ return False, {"message": "Missing article source identity.", "code": "missing_fields"}
+
+ if document_type in {"book", "chapter"} and not source_id:
+ return False, {"message": "Missing book source identity.", "code": "missing_fields"}
+
+ if document_type in {"preprint", "dataset"} and not pid_generic:
+ return False, {"message": "Missing generic PID in item access data.", "code": "missing_fields"}
+
+ if utm and not ignore_utm_validation:
+ if (
+ source_type == "journal"
+ and scielo_issn
+ and scielo_issn != DEFAULT_SCIELO_ISSN
+ and not utm.is_valid_code(scielo_issn, utm.sources_metadata["issn_set"])
+ ):
+ return False, {"message": f"Invalid scielo_issn: {scielo_issn}", "code": "invalid_scielo_issn"}
+
+ if (
+ source_type
+ and source_type != "journal"
+ and source_id
+ and source_id not in utm.sources_metadata.get("source_id_to_type", {})
+ ):
+ return False, {"message": f"Invalid source_id: {source_id}", "code": "invalid_source_id"}
+
+ if pid_v2 and not utm.is_valid_code(pid_v2, utm.documents_metadata["pid_set"]):
+ return False, {"message": f"Invalid pid_v2: {pid_v2}", "code": "invalid_pid_v2"}
+
+ if pid_v3 and not utm.is_valid_code(pid_v3, utm.documents_metadata["pid_set"]):
+ return False, {"message": f"Invalid pid_v3: {pid_v3}", "code": "invalid_pid_v3"}
+
+ if pid_generic and not utm.is_valid_code(pid_generic, utm.documents_metadata["pid_set"]):
+ return False, {"message": f"Invalid pid_generic: {pid_generic}", "code": "invalid_pid_generic"}
+
+ return True, {"message": "Item access data is valid.", "code": "valid"}
+
+
+def update_results_with_item_access_data(results: dict, item_access_data: dict, line: dict):
+ col_acron3 = item_access_data.get("collection")
+ source_key = (
+ item_access_data.get("source_id")
+ or item_access_data.get("scielo_issn")
+ or item_access_data.get("source_type")
+ or col_acron3
+ )
+ pid_v2 = item_access_data.get("pid_v2")
+ pid_v3 = item_access_data.get("pid_v3")
+ media_format = item_access_data.get("media_format")
+ content_language = item_access_data.get("media_language")
+ content_type = item_access_data.get("content_type")
+ access_url = item_access_data.get("access_url") or _normalize_access_url(line.get("url"))
+
+ client_name = line.get("client_name")
+ client_version = line.get("client_version")
+ local_datetime = line.get("local_datetime")
+ access_country_code = line.get("country_code")
+ ip_address = line.get("ip_address")
+
+ truncated_datetime = truncate_datetime_to_hour(local_datetime)
+ ms_key = extract_minute_second_key(local_datetime)
+ if truncated_datetime is None or ms_key is None:
+ raise ValueError("Invalid local_datetime in parsed log line.")
+
+ access_date = truncated_datetime.strftime("%Y-%m-%d")
+ access_year = access_date[:4]
+ access_month = access_date[:7].replace("-", "")
+
+ user_session_id = generate_user_session_id(
+ client_name,
+ client_version,
+ ip_address,
+ truncated_datetime,
+ )
+
+ for access_target in _iter_access_targets(item_access_data):
+ item_access_id = generate_item_access_id(
+ user_session_id=user_session_id,
+ col_acron3=col_acron3,
+ source_key=source_key,
+ pid_v2=pid_v2,
+ pid_v3=pid_v3,
+ pid_generic=access_target.get("pid_generic"),
+ content_language=content_language,
+ access_country_code=access_country_code,
+ media_format=media_format,
+ content_type=content_type,
+ )
+
+ if item_access_id not in results:
+ results[item_access_id] = {
+ "collection": col_acron3,
+ "source_key": source_key,
+ "document_type": access_target.get("document_type"),
+ "pid_v2": pid_v2,
+ "pid_v3": pid_v3,
+ "pid_generic": access_target.get("pid_generic"),
+ "title_pid_generic": (
+ item_access_data.get("title_pid_generic")
+ or access_target.get("pid_generic")
+ ),
+ "user_session_id": user_session_id,
+ "click_timestamps": {ms_key: 0},
+ "click_timestamps_by_url": {},
+ "access_url": access_url,
+ "media_format": media_format,
+ "content_language": content_language,
+ "content_type": content_type,
+ "access_country_code": access_country_code,
+ "access_date": access_date,
+ "access_year": access_year,
+ "access_month": access_month,
+ "publication_year": item_access_data.get("publication_year"),
+ "counter_access_type": item_access_data.get("counter_access_type") or "Open",
+ "access_method": item_access_data.get("access_method") or "Regular",
+ "source": {
+ "source_type": item_access_data.get("source_type"),
+ "source_id": item_access_data.get("source_id"),
+ "scielo_issn": item_access_data.get("scielo_issn"),
+ "main_title": item_access_data.get("source_main_title"),
+ "identifiers": item_access_data.get("source_identifiers"),
+ "access_type": item_access_data.get("source_access_type"),
+ "city": item_access_data.get("source_city"),
+ "country": item_access_data.get("source_country"),
+ "subject_area_capes": item_access_data.get("source_subject_area_capes"),
+ "subject_area_wos": item_access_data.get("source_subject_area_wos"),
+ "acronym": item_access_data.get("source_acronym"),
+ "publisher_name": item_access_data.get("source_publisher_name"),
+ },
+ }
+
+ if ms_key not in results[item_access_id]["click_timestamps"]:
+ results[item_access_id]["click_timestamps"][ms_key] = 0
+
+ results[item_access_id]["click_timestamps"][ms_key] += 1
+
+ access_url_key = access_url or _fallback_access_url_key(
+ access_target.get("pid_generic"),
+ media_format,
+ content_type,
+ )
+ timestamps_by_url = results[item_access_id].setdefault("click_timestamps_by_url", {})
+ url_timestamps = timestamps_by_url.setdefault(access_url_key, {})
+ if ms_key not in url_timestamps:
+ url_timestamps[ms_key] = 0
+ url_timestamps[ms_key] += 1
+
+
+def _extract_source_type(collection_acron3, translated_url):
+ source_type = translated_url.get("source_type")
+ if source_type:
+ return source_type
+
+ if collection_acron3 == "preprints":
+ return "preprint_server"
+
+ if collection_acron3 == "data":
+ return "data_repository"
+
+ if collection_acron3 == "books":
+ return "book"
+
+ if translated_url.get("book_id"):
+ return "book"
+
+ if (
+ translated_url.get("scielo_issn")
+ and translated_url.get("scielo_issn") != DEFAULT_SCIELO_ISSN
+ ):
+ return "journal"
+
+ if translated_url.get("journal_acronym") or translated_url.get("journal_main_title"):
+ return "journal"
+
+ return "other"
+
+
+def _extract_source_id(collection_acron3, translated_url, source_type):
+ source_id = translated_url.get("source_id")
+ if source_id:
+ return source_id
+
+ if source_type == "preprint_server":
+ return translated_url.get("preprint_server_id") or "scielo-preprints"
+
+ if source_type == "data_repository":
+ return translated_url.get("repository_id") or "scielo-data"
+
+ if source_type == "book":
+ return (
+ translated_url.get("book_id")
+ or _extract_book_id_from_pid(translated_url.get("title_pid_generic"))
+ or _extract_book_id_from_pid(translated_url.get("pid_generic"))
+ )
+
+ if source_type == "journal":
+ return translated_url.get("scielo_issn")
+
+ return None
+
+
+def _extract_scielo_issn(translated_url, source_type, source_id):
+ scielo_issn = translated_url.get("scielo_issn")
+ if scielo_issn:
+ return scielo_issn
+
+ if source_type == "journal" and source_id:
+ return source_id
+
+ if source_type in {"book", "other"} or translated_url.get("book_id"):
+ return DEFAULT_SCIELO_ISSN
+
+ return None
+
+
+def _extract_source_title(translated_url):
+ return (
+ translated_url.get("source_main_title")
+ or translated_url.get("journal_main_title")
+ or translated_url.get("book_title")
+ )
+
+
+def _extract_document_type(collection_acron3, translated_url, source_type):
+ document_type = translated_url.get("document_type")
+ if document_type:
+ return document_type
+
+ if collection_acron3 == "preprints":
+ return "preprint"
+
+ if collection_acron3 == "data":
+ return "dataset"
+
+ if collection_acron3 == "books" or source_type == "book":
+ pid_generic = translated_url.get("pid_generic") or ""
+ if translated_url.get("chapter_id") or "/CHAPTER:" in pid_generic.upper():
+ return "chapter"
+ if translated_url.get("book_id"):
+ return "book"
+ return "book"
+
+ if source_type == "journal":
+ return "article"
+
+ return "article"
+
+
+def _extract_source_identifiers(translated_url, source_id, source_type):
+ identifiers = translated_url.get("source_identifiers")
+ if isinstance(identifiers, dict):
+ compact = {key: value for key, value in identifiers.items() if value not in (None, "", [], {}, ())}
+ if compact:
+ return compact
+
+ if source_type != "book":
+ return None
+
+ compact = {
+ "book_id": source_id or translated_url.get("book_id"),
+ "isbn": translated_url.get("isbn"),
+ "eisbn": translated_url.get("eisbn"),
+ "doi": translated_url.get("doi"),
+ }
+ compact = {key: value for key, value in compact.items() if value not in (None, "", [], {}, ())}
+ return compact or None
+
+
+def _extract_book_id_from_pid(value):
+ if not value:
+ return None
+ normalized = str(value).upper()
+ if not normalized.startswith("BOOK:"):
+ return None
+ return normalized.split("BOOK:", 1)[1].split("/", 1)[0] or None
+
+
+def _counter_access_type(source_access_type):
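+    """Normalize a raw source access type into a COUNTER Access_Type value."""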
+ normalized = str(source_access_type or "").strip().lower()
+ if normalized == "commercial":
+ return "Controlled"
+ if normalized in {"free_to_read", "free-to-read", "free"}:
+ return "Free_To_Read"
+ return "Open"
+
+
+def _safe_standardize(func, value, default=""):
+ try:
+ return func(value)
+ except Exception:
+ return default
+
+
+def _standardize_pid_generic_list(values):
+ if not isinstance(values, (list, tuple, set)):
+ return []
+ items = []
+ for value in values:
+ item = _safe_standardize(standardize_pid_generic, value)
+ if item and item not in items:
+ items.append(item)
+ return items
+
+
+def _iter_access_targets(item_access_data):
+ return [
+ {
+ "pid_generic": item_access_data.get("pid_generic"),
+ "document_type": item_access_data.get("document_type"),
+ }
+ ]
+
+
+def _normalize_access_url(url):
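+    """Normalize an access URL to a decoded path with query string, fragment and
+    trailing punctuation removed."""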
+ if not url:
+ return None
+ parsed_url = urlparse(str(url).strip())
+ path = parsed_url.path if parsed_url.scheme or parsed_url.netloc else str(url).strip()
+ path = unquote(path or "")
+    path = path.split("?", 1)[0].split("#", 1)[0]
+    # Guard against an empty or whitespace-only path before taking the first token.
+    path = path.split()[0] if path.split() else ""
+ path = re.sub(r"/+", "/", path)
+ path = path.rstrip(".,;:")
+ return path or None
+
+
+def _fallback_access_url_key(pid_generic, media_format, content_type):
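+    """Build a synthetic pid|format|content-type key used when no access URL is available."""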
+ return "|".join([
+ str(pid_generic or ""),
+ str(media_format or ""),
+ str(content_type or ""),
+ ])
diff --git a/metrics/counter/aggregation.py b/metrics/counter/aggregation.py
new file mode 100644
index 0000000..d047e7a
--- /dev/null
+++ b/metrics/counter/aggregation.py
@@ -0,0 +1,124 @@
+from scielo_usage_counter.counter import get_valid_clicks, is_request
+
+
+def apply_unique_metrics(
+ document,
+ unique_state,
+ scope,
+ document_id,
+ user_session_id,
+ is_request_event,
+):
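+    """Count unique metrics: each (document_id, user_session_id) pair contributes at
+    most one investigation and one request per scope ("item" or "title")."""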
+ if not user_session_id:
+ return
+
+ inv_bucket = unique_state[f"{scope}_investigations"]
+ inv_key = (document_id, user_session_id)
+ add_investigation = inv_key not in inv_bucket
+ if add_investigation:
+ inv_bucket.add(inv_key)
+
+ add_request = False
+ if is_request_event:
+ req_bucket = unique_state[f"{scope}_requests"]
+ req_key = (document_id, user_session_id)
+ add_request = req_key not in req_bucket
+ if add_request:
+ req_bucket.add(req_key)
+
+ increment_document_uniques(
+ document=document,
+ add_investigation=add_investigation,
+ add_request=add_request,
+ )
+
+
+def increment_document_totals(document, click_timestamps, content_type, click_timestamps_by_url=None):
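+    """Add the valid click count to total investigations and, for request-type content,
+    to total requests; the day bucket in daily_metrics, when present, is updated as well."""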
+ number_of_clicks = _count_valid_clicks(
+ click_timestamps=click_timestamps,
+ click_timestamps_by_url=click_timestamps_by_url,
+ )
+
+ document["total_investigations"] += number_of_clicks
+ if is_request(content_type):
+ document["total_requests"] += number_of_clicks
+
+ if "daily_metrics" in document:
+ day_key = list(document["daily_metrics"].keys())[0]
+ document["daily_metrics"][day_key]["total_investigations"] += number_of_clicks
+ if is_request(content_type):
+ document["daily_metrics"][day_key]["total_requests"] += number_of_clicks
+
+
+def _count_valid_clicks(click_timestamps, click_timestamps_by_url=None):
+ if isinstance(click_timestamps_by_url, dict) and click_timestamps_by_url:
+ return sum(
+ get_valid_clicks(timestamps or {})
+ for timestamps in click_timestamps_by_url.values()
+ )
+ return get_valid_clicks(click_timestamps or {})
+
+
+def increment_document_uniques(document, add_investigation=False, add_request=False):
+ if add_investigation:
+ document["unique_investigations"] += 1
+ if add_request:
+ document["unique_requests"] += 1
+
+ if "daily_metrics" in document:
+ day_key = list(document["daily_metrics"].keys())[0]
+ if add_investigation:
+ document["daily_metrics"][day_key]["unique_investigations"] += 1
+ if add_request:
+ document["daily_metrics"][day_key]["unique_requests"] += 1
+
+
+def counter_data_type(document_type):
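+    """Map a SciELO document type to its COUNTER Data_Type."""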
+ if document_type == "dataset":
+ return "Dataset"
+ if document_type in {"article", "preprint"}:
+ return "Article"
+ if document_type == "book":
+ return "Book"
+ if document_type == "chapter":
+ return "Book_Segment"
+ return "Other"
+
+
+def parent_data_type(document_type, source_type=None):
+ if document_type == "chapter":
+ return "Book"
+ if document_type == "article" and source_type == "journal":
+ return "Journal"
+ return None
+
+
+def article_version(document_type):
+ if document_type == "preprint":
+ return "Preprint"
+ return None
+
+
+def should_create_book_item_document(value):
+ if not value.get("pid_generic"):
+ return False
+ if value.get("document_type") == "book" and not is_request(value.get("content_type")):
+ return False
+ return True
+
+
+def extract_title_pid_generic(value, fallback=None):
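+    """Derive the title-level (book) PID from the record, a chapter PID prefix, or the
+    source id; return the fallback when none is available."""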
+ title_pid_generic = value.get("title_pid_generic")
+ if title_pid_generic:
+ return title_pid_generic
+
+ pid_generic = value.get("pid_generic")
+ if "/CHAPTER:" in (pid_generic or "").upper():
+ return pid_generic.upper().split("/CHAPTER:")[0]
+
+ source = value.get("source") or {}
+ source_id = source.get("source_id")
+ if source_id:
+ return f"BOOK:{str(source_id).upper()}"
+
+ return fallback
diff --git a/metrics/counter/documents.py b/metrics/counter/documents.py
new file mode 100644
index 0000000..63730ae
--- /dev/null
+++ b/metrics/counter/documents.py
@@ -0,0 +1,322 @@
+from scielo_usage_counter.counter import is_request
+
+from metrics.counter.aggregation import (
+ apply_unique_metrics,
+ article_version,
+ counter_data_type,
+ extract_title_pid_generic,
+ increment_document_totals,
+ parent_data_type,
+ should_create_book_item_document,
+)
+from metrics.counter.identifiers import generate_month_document_id, generate_year_document_id
+
+
+def convert_to_month_index_documents(data: dict):
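+    """Aggregate raw access results into documents keyed by month-level document IDs."""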
+ if not isinstance(data, dict):
+ return {}
+
+ metrics_data = {}
+ unique_state = _initialize_unique_state()
+
+ for value in data.values():
+ _accumulate_documents(
+ data=metrics_data,
+ unique_state=unique_state,
+ value=value,
+ granularity="month",
+ )
+
+ return metrics_data
+
+
+def convert_to_year_index_documents(data: dict):
+ if not isinstance(data, dict):
+ return {}
+
+ metrics_data = {}
+ unique_state = _initialize_unique_state()
+
+ for value in data.values():
+ _accumulate_documents(
+ data=metrics_data,
+ unique_state=unique_state,
+ value=value,
+ granularity="year",
+ )
+
+ return metrics_data
+
+
+def convert_raw_results_to_index_documents(data: dict):
+ return {
+ "month": convert_to_month_index_documents(data),
+ "year": convert_to_year_index_documents(data),
+ }
+
+
+def _initialize_unique_state():
+ return {
+ "item_investigations": set(),
+ "item_requests": set(),
+ "title_investigations": set(),
+ "title_requests": set(),
+ }
+
+
+def _accumulate_documents(data, unique_state, value, granularity):
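+    """Route a single raw access record to the books-specific or the standard accumulator."""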
+ if not isinstance(value, dict):
+ return
+
+ if value.get("collection") == "books":
+ _accumulate_books_documents(data, unique_state, value, granularity)
+ return
+
+ _accumulate_standard_documents(data, unique_state, value, granularity)
+
+
+def _accumulate_standard_documents(data, unique_state, value, granularity):
+ document_id = _generate_document_id(value, granularity)
+ document = data.setdefault(
+ document_id,
+ _build_base_document(value=value, granularity=granularity),
+ )
+
+ increment_document_totals(
+ document=document,
+ click_timestamps=value.get("click_timestamps"),
+ click_timestamps_by_url=value.get("click_timestamps_by_url"),
+ content_type=value.get("content_type"),
+ )
+ apply_unique_metrics(
+ document=document,
+ unique_state=unique_state,
+ scope="item",
+ document_id=document_id,
+ user_session_id=value.get("user_session_id"),
+ is_request_event=is_request(value.get("content_type")),
+ )
+
+
+def _accumulate_books_documents(data, unique_state, value, granularity):
+ if should_create_book_item_document(value):
+ item_document_id = _generate_document_id(
+ value,
+ granularity,
+ metric_scope="item",
+ )
+ item_document = data.setdefault(
+ item_document_id,
+ _build_base_document(
+ value=value,
+ granularity=granularity,
+ metric_scope="item",
+ ),
+ )
+ increment_document_totals(
+ document=item_document,
+ click_timestamps=value.get("click_timestamps"),
+ click_timestamps_by_url=value.get("click_timestamps_by_url"),
+ content_type=value.get("content_type"),
+ )
+ apply_unique_metrics(
+ document=item_document,
+ unique_state=unique_state,
+ scope="item",
+ document_id=item_document_id,
+ user_session_id=value.get("user_session_id"),
+ is_request_event=is_request(value.get("content_type")),
+ )
+
+ title_pid_generic = extract_title_pid_generic(value)
+ if not title_pid_generic:
+ return
+
+ title_document_id = _generate_document_id(
+ value,
+ granularity,
+ metric_scope="title",
+ pid_generic=title_pid_generic,
+ )
+ title_document = data.setdefault(
+ title_document_id,
+ _build_base_document(
+ value=value,
+ granularity=granularity,
+ metric_scope="title",
+ pid_generic=title_pid_generic,
+ document_type="book",
+ ),
+ )
+ increment_document_totals(
+ document=title_document,
+ click_timestamps=value.get("click_timestamps"),
+ click_timestamps_by_url=value.get("click_timestamps_by_url"),
+ content_type=value.get("content_type"),
+ )
+ apply_unique_metrics(
+ document=title_document,
+ unique_state=unique_state,
+ scope="title",
+ document_id=title_document_id,
+ user_session_id=value.get("user_session_id"),
+ is_request_event=is_request(value.get("content_type")),
+ )
+
+
+def _generate_document_id(value, granularity, metric_scope=None, pid_generic=None):
+ pid_generic = pid_generic or value.get("pid_generic")
+ publication_year = str(value.get("publication_year") or "0001")
+ if granularity == "month":
+ access_month = value.get("access_date", "")[:7] if value.get("access_date") else ""
+ return generate_month_document_id(
+ collection=value.get("collection"),
+ source_key=value.get("source_key"),
+ pid_v2=value.get("pid_v2"),
+ pid_v3=value.get("pid_v3"),
+ pid_generic=pid_generic,
+ access_month=access_month,
+ counter_access_type=value.get("counter_access_type") or "Open",
+ access_method=value.get("access_method") or "Regular",
+ publication_year=publication_year,
+ metric_scope="title" if metric_scope == "title" else None,
+ )
+
+ return generate_year_document_id(
+ collection=value.get("collection"),
+ source_key=value.get("source_key"),
+ pid_v2=value.get("pid_v2"),
+ pid_v3=value.get("pid_v3"),
+ pid_generic=pid_generic,
+ content_language=value.get("content_language"),
+ access_country_code=value.get("access_country_code"),
+ access_year=value.get("access_year"),
+ counter_access_type=value.get("counter_access_type") or "Open",
+ access_method=value.get("access_method") or "Regular",
+ publication_year=publication_year,
+ metric_scope="title" if metric_scope == "title" else None,
+ )
+
+
+def _build_base_document(value, granularity, metric_scope=None, pid_generic=None, document_type=None):
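+    """Build the skeleton document (descriptive fields plus zeroed metrics) for the books
+    or standard case, at month or year granularity."""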
+ collection = value.get("collection")
+ if collection == "books":
+ normalized_pid_generic = pid_generic or value.get("pid_generic")
+ title_pid_generic = extract_title_pid_generic(value, fallback=normalized_pid_generic)
+ base_document = {
+ "collection": collection,
+ "source": _build_books_source(value.get("source")),
+ "document_type": document_type or value.get("document_type"),
+ "scielo_document_type": document_type or value.get("document_type"),
+ "metric_scope": metric_scope or "item",
+ "counter_data_type": "Book" if metric_scope == "title" else "Book_Segment",
+ "parent_data_type": "Book" if metric_scope != "title" else None,
+ "title_pid_generic": title_pid_generic,
+ "pid": normalized_pid_generic,
+ "pid_generic": normalized_pid_generic,
+ "publication_year": value.get("publication_year"),
+ "counter_access_type": value.get("counter_access_type") or "Open",
+ "access_method": value.get("access_method") or "Regular",
+ "total_requests": 0,
+ "total_investigations": 0,
+ "unique_requests": 0,
+ "unique_investigations": 0,
+ }
+ _apply_access_fields(base_document, value, granularity)
+ if granularity == "year":
+ base_document["content_language"] = value.get("content_language")
+ base_document["access_country_code"] = value.get("access_country_code")
+ return base_document
+
+ base_document = {
+ "collection": collection,
+ "source": _build_standard_source(value.get("source")),
+ "document_type": value.get("document_type"),
+ "scielo_document_type": value.get("document_type"),
+ "metric_scope": "item",
+ "counter_data_type": counter_data_type(value.get("document_type")),
+ "parent_data_type": parent_data_type(
+ value.get("document_type"),
+ (value.get("source") or {}).get("source_type"),
+ ),
+ "article_version": article_version(value.get("document_type")),
+ "pid": value.get("pid_v3") or value.get("pid_v2") or value.get("pid_generic"),
+ "pid_v2": value.get("pid_v2"),
+ "pid_v3": value.get("pid_v3"),
+ "pid_generic": value.get("pid_generic"),
+ "publication_year": value.get("publication_year"),
+ "counter_access_type": value.get("counter_access_type") or "Open",
+ "access_method": value.get("access_method") or "Regular",
+ "total_requests": 0,
+ "total_investigations": 0,
+ "unique_requests": 0,
+ "unique_investigations": 0,
+ }
+ _apply_access_fields(base_document, value, granularity)
+ if granularity == "year":
+ base_document["content_language"] = value.get("content_language")
+ base_document["access_country_code"] = value.get("access_country_code")
+ return base_document
+
+
+def _apply_access_fields(base_document, value, granularity):
+ if granularity == "month":
+ base_document["access_month"] = value.get("access_date", "")[:7] if value.get("access_date") else ""
+ day = value.get("access_date", "")[-2:] if value.get("access_date") else "01"
+ base_document["daily_metrics"] = {
+ day: {
+ "total_requests": 0,
+ "total_investigations": 0,
+ "unique_requests": 0,
+ "unique_investigations": 0,
+ }
+ }
+ return
+
+ base_document["access_year"] = value.get("access_year")
+
+
+def _build_books_source(source):
+ source = source or {}
+ identifiers = source.get("identifiers") or {}
+ compact_identifiers = {
+ key: value
+ for key, value in identifiers.items()
+ if key in {"book_id", "isbn", "eisbn", "doi"} and value not in (None, "", [], {}, ())
+ }
+
+ return {
+ "source_type": source.get("source_type"),
+ "source_id": source.get("source_id"),
+ "main_title": source.get("main_title"),
+ "access_type": source.get("access_type"),
+ "publisher": source.get("publisher_name"),
+ "city": source.get("city"),
+ "country": source.get("country"),
+ "identifiers": compact_identifiers,
+ }
+
+
+def _build_standard_source(source):
+ source = source or {}
+ identifiers = source.get("identifiers") or {}
+ compact_identifiers = {
+ key: value
+ for key, value in identifiers.items()
+ if value not in (None, "", [], {}, ())
+ }
+
+ return {
+ "source_type": source.get("source_type"),
+ "source_id": source.get("source_id"),
+ "scielo_issn": source.get("scielo_issn"),
+ "main_title": source.get("main_title"),
+ "acronym": source.get("acronym"),
+ "publisher_name": source.get("publisher_name"),
+ "subject_area_capes": source.get("subject_area_capes"),
+ "subject_area_wos": source.get("subject_area_wos"),
+ "access_type": source.get("access_type"),
+ "city": source.get("city"),
+ "country": source.get("country"),
+ "identifiers": compact_identifiers,
+ }
diff --git a/metrics/counter/identifiers.py b/metrics/counter/identifiers.py
new file mode 100644
index 0000000..bef7b8d
--- /dev/null
+++ b/metrics/counter/identifiers.py
@@ -0,0 +1,110 @@
+def generate_user_session_id(client_name, client_version, ip_address, datetime, sep="|"):
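+    """Build a session identifier from client name/version, IP address, and the access
+    date and hour."""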
+ dt_year_month_day = datetime.strftime("%Y-%m-%d")
+ dt_hour = datetime.strftime("%H")
+
+ return sep.join(
+ [
+ str(client_name),
+ str(client_version),
+ str(ip_address),
+ str(dt_year_month_day),
+ str(dt_hour),
+ ]
+ )
+
+
+def generate_item_access_id(
+ col_acron3,
+ source_key,
+ pid_v2,
+ pid_v3,
+ pid_generic,
+ user_session_id,
+ access_country_code,
+ content_language,
+ media_format,
+ content_type,
+ sep="|",
+):
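+    """Join the fields that identify a distinct item access into a pipe-separated key."""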
+ return sep.join(
+ [
+ col_acron3,
+ str(source_key or ""),
+ pid_v2 or "",
+ pid_v3 or "",
+ pid_generic or "",
+ str(user_session_id or ""),
+ str(access_country_code or ""),
+ str(content_language or ""),
+ str(media_format or ""),
+ str(content_type or ""),
+ ]
+ )
+
+
+def generate_month_document_id(
+ collection: str,
+ source_key: str,
+ pid_v2: str,
+ pid_v3: str,
+ pid_generic: str,
+ access_month: str,
+ counter_access_type: str,
+ access_method: str,
+ publication_year: str,
+ metric_scope: str = None,
+) -> str:
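+    """Join the identifying fields into a pipe-separated monthly document ID; an optional
+    metric_scope prefix ("title") marks title-level book documents."""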
+ parts = []
+ if metric_scope:
+ parts.append(metric_scope)
+
+ parts.extend(
+ [
+ str(collection or ""),
+ str(source_key or ""),
+ pid_v2 or "",
+ pid_v3 or "",
+ pid_generic or "",
+ str(access_month or ""),
+ str(counter_access_type or ""),
+ str(access_method or ""),
+ str(publication_year or ""),
+ ]
+ )
+ return "|".join(parts)
+
+
+def generate_year_document_id(
+ collection: str,
+ source_key: str,
+ pid_v2: str,
+ pid_v3: str,
+ pid_generic: str,
+ content_language: str,
+ access_country_code: str,
+ access_year: str,
+ counter_access_type: str,
+ access_method: str,
+ publication_year: str,
+ metric_scope: str = None,
+) -> str:
+ parts = []
+ if metric_scope:
+ parts.append(metric_scope)
+
+ parts.extend(
+ [
+ str(collection or ""),
+ str(source_key or ""),
+ pid_v2 or "",
+ pid_v3 or "",
+ pid_generic or "",
+ content_language or "",
+ access_country_code or "",
+ str(access_year or ""),
+ str(counter_access_type or ""),
+ str(access_method or ""),
+ str(publication_year or ""),
+ ]
+ )
+ return "|".join(parts)
diff --git a/metrics/utils/parser_utils.py b/metrics/counter/parser.py
similarity index 92%
rename from metrics/utils/parser_utils.py
rename to metrics/counter/parser.py
index ef142e6..2081e5d 100644
--- a/metrics/utils/parser_utils.py
+++ b/metrics/counter/parser.py
@@ -1,6 +1,7 @@
import logging
from scielo_usage_counter.translator.classic import URLTranslatorClassicSite
+from scielo_usage_counter.translator.books import URLTranslatorBooksSite
from scielo_usage_counter.translator.dataverse import URLTranslatorDataverseSite
from scielo_usage_counter.translator.opac import URLTranslatorOPACSite
from scielo_usage_counter.translator.opac_alpha import URLTranslatorOPACAlphaSite
@@ -38,6 +39,7 @@ def translator_class_name_to_obj(name: str):
return None
translator_classes = {
+ 'books': URLTranslatorBooksSite,
'classic': URLTranslatorClassicSite,
'dataverse': URLTranslatorDataverseSite,
'opac': URLTranslatorOPACSite,
diff --git a/metrics/es.py b/metrics/es.py
deleted file mode 100644
index 25ad701..0000000
--- a/metrics/es.py
+++ /dev/null
@@ -1,385 +0,0 @@
-import logging
-
-from elasticsearch import Elasticsearch, helpers, NotFoundError
-from django.conf import settings
-
-from .utils import index_utils
-
-
-DEFAULT_ES_INDEX_USAGE_MAPPINGS = {
- "properties": {
- "collection": {
- "type": "keyword"
- },
- "journal": {
- "properties": {
- "scielo_issn": {
- "type": "keyword"
- },
- "main_title": {
- "type": "keyword"
- },
- "subject_area_capes": {
- "type": "keyword"
- },
- "subject_area_wos": {
- "type": "keyword"
- },
- "acronym": {
- "type": "keyword"
- },
- "publisher": {
- "type": "keyword"
- }
- }
- },
- "pid": {
- "type": "keyword"
- },
- "pid_v2": {
- "type": "keyword"
- },
- "pid_v3": {
- "type": "keyword"
- },
- "pid_generic": {
- "type": "keyword"
- },
- "year_of_publication": {
- "type": "integer"
- },
- "media_language": {
- "type": "keyword"
- },
- "country_code": {
- "type": "keyword"
- },
- "date": {
- "type": "date",
- "format": "yyyy-MM-dd"
- },
- "total_requests": {
- "type": "integer"
- },
- "total_investigations": {
- "type": "integer"
- },
- "unique_requests": {
- "type": "integer"
- },
- "unique_investigations": {
- "type": "integer"
- }
- }
-}
-
-
-class ElasticSearchUsageWrapper:
- """
- Wrapper for Elasticsearch usage metrics operations.
- This class provides methods to interact with Elasticsearch for indexing,
- deleting, and managing usage metrics data.
- """
-
- def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=False):
- self.client = self.get_elasticsearch_client(url, basic_auth, api_key, verify_certs)
-
-
- def get_elasticsearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=False):
- """
- Create an Elasticsearch client instance using Django settings.
-
- :param url: Elasticsearch URL. If None, it will be taken from Django settings.
- :param basic_auth: Basic authentication credentials. If None, it will be taken from Django settings.
- :param api_key: API key. If None, it will be taken from Django settings.
- :param verify_certs: Whether to verify SSL certificates. If None, it will be taken from Django settings.
- """
- if not url:
- url = getattr(settings, "ES_URL", None)
-
- if not basic_auth:
- basic_auth = getattr(settings, "ES_BASIC_AUTH", None)
-
- if not api_key:
- api_key = getattr(settings, "ES_API_KEY", None)
-
- if not verify_certs:
- verify_certs = getattr(settings, "ES_VERIFY_CERTS", False)
-
- if basic_auth:
- client = Elasticsearch(url, basic_auth=basic_auth, verify_certs=verify_certs)
- elif api_key:
- client = Elasticsearch(url, api_key=api_key, verify_certs=verify_certs)
- else:
- client = Elasticsearch(url, verify_certs=verify_certs)
-
- return client
-
-
- def ping(self):
- """
- Check if the Elasticsearch client is available.
- Returns True if the client is available, False otherwise.
- """
- try:
- return self.client.ping()
- except Exception as e:
- logging.error(f"Error pinging Elasticsearch client: {e}")
- return False
-
-
- def create_index(self, index_name, mappings=None, ping_client=False):
- """
- Create an Elasticsearch index.
-
- :param index_name: Name of the index to create.
- :param mappings: Mappings for the index. If None, default mappings will be used.
- :param ping_client: If True, checks if the Elasticsearch client is available before creating the index.
- """
- if ping_client and not self.ping():
- return
-
- if not mappings:
- mappings = DEFAULT_ES_INDEX_USAGE_MAPPINGS
-
- resp = self.client.indices.create(
- index=index_name,
- mappings=mappings,
- )
- logging.info(f"Index {index_name} created: {resp}")
-
-
- def create_index_if_not_exists(self, index_name, mappings=None, ping_client=False):
- """
- Create an Elasticsearch index if it does not already exist.
-
- :param index_name: Name of the index to create.
- :param mappings: Mappings for the index. If None, default mappings will be used.
- :param ping_client: If True, checks if the Elasticsearch client is available before creating the index.
- """
- if ping_client and not self.ping():
- return
-
- if not self.client.indices.exists(index=index_name):
- self.create_index(index_name, mappings, ping_client)
- else:
- logging.info(f"Index {index_name} already exists. Skipping creation.")
-
-
- def delete_index(self, index_name, ping_client=False):
- """
- Delete an Elasticsearch index.
-
- :param index_name: Name of the index to delete.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the index.
- """
- if ping_client and not self.ping():
- return
-
- self.client.indices.delete(index=index_name)
-
-
- def index_document(self, index_name, doc_id, document, ping_client=False):
- """
- Index a document in Elasticsearch.
-
- :param index_name: Name of the index.
- :param doc_id: ID of the document.
- :param document: Document to index.
- :param ping_client: If True, checks if the Elasticsearch client is available before indexing the document.
- """
- if ping_client and not self.ping():
- return
-
- self.client.index(index=index_name, id=doc_id, document=document)
-
-
- def index_documents(self, index_name, documents, ping_client=False):
- """
- Index multiple documents in Elasticsearch.
-
- :param index_name: Name of the index.
- :param documents: Dictionary of documents to index, where keys are document IDs and values are the documents.
- :param ping_client: If True, checks if the Elasticsearch client is available before indexing the documents.
- """
- if ping_client and not self.ping():
- return
-
- helpers.bulk(
- self.client,
- (
- {
- "_index": index_name,
- "_id": doc_id,
- "_source": document,
- }
- for doc_id, document in documents.items()
- ),
- )
-
-
- def delete_document(self, index_name, doc_id, ping_client=False):
- """
- Delete a document from Elasticsearch.
-
- :param index_name: Name of the index.
- :param doc_id: ID of the document to delete.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the document.
- """
- if ping_client and not self.ping():
- return
-
- try:
- self.client.delete(index=index_name, id=doc_id)
- except NotFoundError as e:
- logging.error(f"Failed to delete document {doc_id} from Elasticsearch: {e}")
-
-
- def delete_documents(self, index_name, doc_ids, ping_client=False):
- """
- Delete multiple documents from Elasticsearch using bulk.
- :param index_name: Name of the index.
- :param doc_ids: List of document IDs to delete.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents.
- """
- if ping_client and not self.ping():
- return
-
- actions = (
- {
- "_op_type": "delete",
- "_index": index_name,
- "_id": doc_id,
- }
- for doc_id in doc_ids
- )
-
- try:
- helpers.bulk(self.client, actions)
- except helpers.BulkIndexError as e:
- logging.error(f"BulkIndexError occurred: {e.errors}")
-
-
- def delete_documents_by_key(self, index_name, data, ping_client=False):
- """
- Delete multiple documents from Elasticsearch based on specific key-value pairs.
-
- :param index_name: Name of the index.
- :param data: Dictionary where keys are field names and values are single values or lists of values.
- :param ping_client: If True, checks if the Elasticsearch client is available before deleting the documents.
- """
- if ping_client and not self.ping():
- return
-
- query = {
- "query": {
- "bool": {
- "must": [
- {
- "terms": {
- key: values if isinstance(values, list) else [values]
- }
- }
- for key, values in data.items()
- ]
- }
- }
- }
-
- try:
- self.client.delete_by_query(index=index_name, body=query)
- return True
- except Exception as e:
- logging.error(f"Failed to delete documents: {e}")
-
- return False
-
-
- def fetch_and_update_documents_locally(self, index_name, documents, batch_size=5000, ping_client=False):
- """
- Fetch existing documents from Elasticsearch and update local documents with accumulated metrics.
- This function retrieves documents from Elasticsearch in batches and merges their metric fields
- with the provided local documents. The merge operation adds values for specific metric fields
- or sets them if they don't exist in the local documents.
-
- Args:
- index_name (str): Name of the Elasticsearch index to fetch documents from.
- documents (dict): Dictionary of documents to be updated, where keys are document IDs and values
- are dictionaries containing metric data.
- batch_size (int, optional): Number of documents to fetch in each batch from Elasticsearch.
- Defaults to 5000.
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available before
- fetching documents. Defaults to False.
-
- Returns:
- None: The function modifies the input documents dictionary in-place.
- """
- if ping_client and not self.ping():
- return
-
- existing_docs = {}
- ids = list(documents.keys())
-
- for i in range(0, len(ids), batch_size):
- batch_ids = ids[i:i+batch_size]
- resp = self.client.mget(index=index_name, ids=batch_ids)
- for doc in resp.get('docs', []):
- if doc.get('found'):
- existing_docs[doc['_id']] = doc['_source']
- logging.info(f'Found {len(existing_docs)} existing documents in Elasticsearch for update.')
-
- for doc_id, existing in existing_docs.items():
- current = documents[doc_id]
- for field in [
- "total_requests",
- "unique_requests",
- "total_investigations",
- "unique_investigations",
- ]:
- if field in existing and field in current:
- current[field] += existing[field]
- elif field in existing:
- current[field] = existing[field]
-
-
- def export_to_index(self, index_name, data, batch_size=5000, ping_client=False):
- """
- Export data to Elasticsearch index in bulk operations.
- This function converts input data to index documents, processes them locally,
- and then indexes them to Elasticsearch in batches to optimize performance.
-
- Args:
- index_name (str): Name of the Elasticsearch index to export data to.
- data: The data to be exported to the Elasticsearch index
- batch_size (int, optional): Number of documents to process in each bulk operation.
- Defaults to 5000.
- ping_client (bool, optional): If True, checks if the Elasticsearch client is available
-
- Returns:
- None: Function performs side effects by indexing data to Elasticsearch
- """
- if ping_client and not self.ping():
- return
-
- bulk_data = []
- documents = index_utils.convert_to_index_documents(data)
- self.fetch_and_update_documents_locally(index_name=index_name, documents=documents)
-
- for key, metric_data in documents.items():
- metric_data['pid'] = metric_data.get('pid_v3') or metric_data.get('pid_v2') or metric_data.get('pid_generic', '')
- bulk_data.append({
- "_id": key,
- "_source": metric_data,
- })
-
- if len(bulk_data) >= batch_size:
- self.index_documents(
- index_name=index_name,
- documents={doc["_id"]: doc["_source"] for doc in bulk_data},
- )
- bulk_data = []
-
- self.index_documents(
- index_name=index_name,
- documents={doc["_id"]: doc["_source"] for doc in bulk_data},
- )
diff --git a/metrics/fixtures/top100articles.csv b/metrics/fixtures/top100articles.csv
deleted file mode 100755
index 9d979f3..0000000
--- a/metrics/fixtures/top100articles.csv
+++ /dev/null
@@ -1,97 +0,0 @@
-print_issn online_issn pid_issn collection pid yop year_month_day total_item_requests total_item_investigations unique_item_requests unique_item_investigations
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300005 2005 2024-05-26 13 16 13 16
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100020 2009 2024-05-26 9 10 8 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200012 2009 2024-05-26 8 9 8 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200018 2009 2024-05-26 8 8 8 8
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300004 2005 2024-05-26 8 11 8 11
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200011 2009 2024-05-26 8 9 8 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200001 2009 2024-05-26 7 7 7 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200010 2009 2024-05-26 7 9 7 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300007 2005 2024-05-26 7 10 7 10
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200003 2009 2024-05-26 7 9 7 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400010 2008 2024-05-26 7 7 7 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300008 2005 2024-05-26 7 9 7 9
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000400008 2009 2024-05-26 7 7 7 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400009 2006 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200009 2006 2024-05-26 6 7 6 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000100007 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300003 2007 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100022 2009 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000100006 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200002 2009 2024-05-26 6 7 6 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000100002 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000200014 2007 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100021 2009 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000400010 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000200001 2010 2024-05-26 6 6 6 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000200002 2010 2024-05-26 6 7 6 7
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200014 2009 2024-05-26 5 6 5 6
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100014 2009 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000200009 2005 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200004 2009 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000100016 2006 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200015 2006 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300005 2007 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000300009 2009 2024-05-26 5 5 5 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000200010 2010 2024-05-26 4 4 4 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100015 2008 2024-05-26 3 4 3 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300002 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200015 2009 2024-05-26 2 3 2 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300001 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300009 2005 2024-05-26 2 4 2 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200005 2009 2024-05-26 2 4 2 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200008 2009 2024-05-26 2 3 2 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300006 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300010 2005 2024-05-26 2 2 2 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000300003 2005 2024-05-26 2 5 2 5
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000300001 2006 2024-05-26 2 2 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100005 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200016 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400004 2005 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100009 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000100014 2005 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200009 2009 2024-05-26 1 2 1 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000100019 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200013 2009 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400007 2008 2024-05-26 1 2 1 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000300010 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200006 2009 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200018 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400002 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142010000300005 2010 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200007 2006 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400004 2006 2024-05-26 1 2 1 2
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000100004 2007 2024-05-26 1 3 1 3
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000200021 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000100002 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100004 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000400004 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400006 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400005 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000300006 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400011 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300001 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000100020 2007 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400002 2006 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000100005 2005 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200017 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100005 2008 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000200007 2009 2024-05-26 1 4 1 4
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000100023 2009 2024-05-26 1 1 1 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100008 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400008 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400005 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000200006 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400007 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000200013 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000400003 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142009000400006 2009 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300008 2007 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000200008 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000200006 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000400004 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142005000400006 2005 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142007000300006 2007 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000300003 2006 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142008000100007 2008 2024-05-26 0 1 0 1
-0002-7014 1851-8044 0002-7014 arg S0002-70142006000100009 2006 2024-05-26 0 1 0 1
diff --git a/metrics/fixtures/top100articles.tar.gz b/metrics/fixtures/top100articles.tar.gz
deleted file mode 100644
index cd49556..0000000
Binary files a/metrics/fixtures/top100articles.tar.gz and /dev/null differ
diff --git a/metrics/management/__init__.py b/metrics/management/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/metrics/management/__init__.py
@@ -0,0 +1 @@
+
diff --git a/metrics/management/commands/__init__.py b/metrics/management/commands/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/metrics/management/commands/__init__.py
@@ -0,0 +1 @@
+
diff --git a/metrics/management/commands/export_book_r51_monthly_metrics.py b/metrics/management/commands/export_book_r51_monthly_metrics.py
new file mode 100644
index 0000000..9889387
--- /dev/null
+++ b/metrics/management/commands/export_book_r51_monthly_metrics.py
@@ -0,0 +1,431 @@
+import csv
+import json
+from collections import defaultdict
+from pathlib import Path
+
+from device_detector import DeviceDetector
+from django.core.management.base import BaseCommand, CommandError
+
+from collection.models import Collection
+from document.models import Document
+from metrics.counter import access, documents as index_docs
+from resources.models import MMDB, RobotUserAgent
+from scielo_usage_counter import log_handler, url_translator
+from scielo_usage_counter.translator.books import URLTranslatorBooksSite
+from source.models import Source
+
+
+class Command(BaseCommand):
+ help = (
+ "Generate COUNTER R5.1 monthly book metrics from one or more log files, "
+ "writing item and title CSV outputs."
+ )
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "--input",
+ dest="inputs",
+ action="append",
+ required=True,
+ help="Input log file path. Repeat --input for multiple files.",
+ )
+ parser.add_argument(
+ "--item-output",
+ required=True,
+ help="Output CSV path for item-level monthly metrics.",
+ )
+ parser.add_argument(
+ "--title-output",
+ required=True,
+ help="Output CSV path for title-level monthly metrics.",
+ )
+ parser.add_argument(
+ "--summary-output",
+ help="Optional JSON path with parse and totals summary.",
+ )
+ parser.add_argument(
+ "--collection",
+ default="books",
+ help="Collection acronym (default: books).",
+ )
+ parser.add_argument(
+ "--robots-source",
+ choices=sorted(RobotUserAgent.SOURCE_CHOICES),
+ default=RobotUserAgent.SOURCE_ALL,
+ help="Which active robot list to use: all, counter, or scielo.",
+ )
+
+ def handle(self, *args, **options):
+ input_paths = [Path(value).expanduser() for value in options["inputs"]]
+ item_output = Path(options["item_output"]).expanduser()
+ title_output = Path(options["title_output"]).expanduser()
+ summary_output = (
+ Path(options["summary_output"]).expanduser()
+ if options.get("summary_output")
+ else None
+ )
+
+ for path in input_paths:
+ if not path.exists():
+ raise CommandError(f"Input file not found: {path}")
+
+ collection = Collection.objects.filter(acron3=options["collection"]).first()
+ if not collection:
+ raise CommandError(f"Collection not found: {options['collection']}")
+
+ robots_source = options["robots_source"]
+ robots_list = RobotUserAgent.get_patterns(source=robots_source)
+ if not robots_list:
+ raise CommandError(
+ f"No robot user agents found in database for source {robots_source}."
+ )
+
+ mmdb = MMDB.objects.order_by("-created").first()
+ if not mmdb:
+ raise CommandError("No MMDB found in database.")
+
+ parser = log_handler.LogParser(
+ mmdb_data=mmdb.data,
+ robots_list=robots_list,
+ output_mode="dict",
+ )
+ utm = url_translator.URLTranslationManager(
+ documents_metadata=Document.metadata(collection=collection),
+ sources_metadata=Source.metadata(collection=collection),
+ translator=URLTranslatorBooksSite,
+ )
+
+ results = {}
+ parse_summaries = []
+ ua_cache = {}
+
+ for path in input_paths:
+ self.stdout.write(f"Processing {path}...")
+ parse_summaries.append(
+ self._parse_file(
+ path=path,
+ parser=parser,
+ utm=utm,
+ collection=collection,
+ ua_cache=ua_cache,
+ results=results,
+ )
+ )
+
+ monthly_documents = self._build_monthly_documents(results)
+
+ self._write_item_csv(item_output, monthly_documents["item"])
+ self._write_title_csv(title_output, monthly_documents["title"])
+
+ summary = {
+ "robots_source": robots_source,
+ "raw_result_count": len(results),
+ "parse_summaries": parse_summaries,
+ "totals": {
+ "total_item_requests": sum(
+ doc.get("total_requests", 0) for doc in monthly_documents["item"]
+ ),
+ "total_item_investigations": sum(
+ doc.get("total_investigations", 0)
+ for doc in monthly_documents["item"]
+ ),
+ "unique_item_requests": sum(
+ doc.get("unique_requests", 0) for doc in monthly_documents["item"]
+ ),
+ "unique_item_investigations": sum(
+ doc.get("unique_investigations", 0)
+ for doc in monthly_documents["item"]
+ ),
+ "title_total_item_requests": sum(
+ doc.get("total_requests", 0) for doc in monthly_documents["title"]
+ ),
+ "title_total_item_investigations": sum(
+ doc.get("total_investigations", 0)
+ for doc in monthly_documents["title"]
+ ),
+ "unique_title_requests": sum(
+ doc.get("unique_requests", 0) for doc in monthly_documents["title"]
+ ),
+ "unique_title_investigations": sum(
+ doc.get("unique_investigations", 0)
+ for doc in monthly_documents["title"]
+ ),
+ },
+ }
+
+ if summary_output:
+ summary_output.parent.mkdir(parents=True, exist_ok=True)
+ summary_output.write_text(json.dumps(summary, indent=2, sort_keys=True))
+
+ self.stdout.write(self.style.SUCCESS(f"Item CSV written to {item_output}"))
+ self.stdout.write(self.style.SUCCESS(f"Title CSV written to {title_output}"))
+ if summary_output:
+ self.stdout.write(self.style.SUCCESS(f"Summary JSON written to {summary_output}"))
+
+ def _parse_file(self, path, parser, utm, collection, ua_cache, results):
+ stats = defaultdict(int)
+ imported = 0
+
+ with path.open("rb") as fh:
+ for raw_line in fh:
+ stats["lines_parsed"] += 1
+
+ try:
+ line = raw_line.decode().strip()
+ except UnicodeDecodeError:
+ line = raw_line.decode("utf-8", errors="ignore").strip()
+
+ match, ip_value = parser.match_with_best_pattern(line)
+ if not match:
+ stats["total_ignored_lines"] += 1
+ continue
+
+ data = match.groupdict()
+ is_bunny = "unix_ts" in data
+ method = "GET" if is_bunny else data.get("method")
+ status = data.get("status")
+ user_agent = parser.format_user_agent(data.get("user_agent"))
+ url = data.get("path")
+ ip_address = ip_value
+
+ if not parser.has_valid_method(method):
+ stats["ignored_lines_invalid_method"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ if not parser.has_valid_status(status):
+ if parser.status_is_redirect(status):
+ stats["ignored_lines_http_redirects"] += 1
+ elif parser.status_is_error(status):
+ stats["ignored_lines_http_errors"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ if parser.user_agent_is_bot(user_agent):
+ stats["ignored_lines_bot"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ if not parser.has_supported_url(url):
+ stats["ignored_lines_static_resources"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ if is_bunny:
+ local_datetime = parser.format_date(data.get("unix_ts"), None)
+ country_code = data.get("country") or parser.geoip.ip_to_country_code(
+ ip_address
+ )
+ else:
+ local_datetime = parser.format_date(data.get("date"), data.get("timezone"))
+ country_code = parser.geoip.ip_to_country_code(ip_address)
+
+ if not local_datetime:
+ stats["ignored_lines_invalid_local_datetime"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ if not country_code:
+ stats["ignored_lines_invalid_country_code"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ device = ua_cache.get(user_agent)
+ if device is None:
+ try:
+ device = DeviceDetector(user_agent).parse()
+ except ZeroDivisionError:
+ stats["ignored_lines_invalid_user_agent"] += 1
+ stats["total_ignored_lines"] += 1
+ ua_cache[user_agent] = False
+ continue
+ ua_cache[user_agent] = device
+ elif device is False:
+ stats["ignored_lines_invalid_user_agent"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ client_name = parser.format_client_name(device)
+ client_version = parser.format_client_version(device)
+
+ if not client_name:
+ stats["ignored_lines_invalid_client_name"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ if not client_version:
+ stats["ignored_lines_invalid_client_version"] += 1
+ stats["total_ignored_lines"] += 1
+ continue
+
+ translated = utm.translate(url)
+ item_access_data = access.extract_item_access_data(
+ collection.acron3,
+ translated,
+ )
+ is_valid, _ = access.is_valid_item_access_data(
+ item_access_data,
+ utm,
+ ignore_utm_validation=True,
+ )
+ if not is_valid:
+ stats["total_ignored_lines"] += 1
+ continue
+
+ access.update_results_with_item_access_data(
+ results,
+ item_access_data,
+ {
+ "client_name": client_name,
+ "client_version": client_version,
+ "ip_address": ip_address,
+ "country_code": country_code,
+ "local_datetime": local_datetime,
+ "url": url,
+ },
+ )
+ imported += 1
+ stats["total_imported_lines"] += 1
+
+ return {"path": str(path), "valid_lines_used": imported, **stats}
+
+ def _build_monthly_documents(self, results):
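+        """Collapse month-granularity documents into per-month item and title rows,
+        summing metrics across the remaining ID dimensions."""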
+ documents = index_docs.convert_raw_results_to_index_documents(results)
+ item_documents = {}
+ title_documents = {}
+
+ for doc in documents["month"].values():
+ year_month = doc.get("access_month", "")
+ scope = doc.get("metric_scope", "item")
+ if scope == "title":
+ key = (
+ year_month,
+ doc.get("title_pid_generic") or doc.get("pid_generic"),
+ doc.get("document_type"),
+ )
+ if key not in title_documents:
+ title_documents[key] = {
+ "year_month": year_month,
+ "title_pid_generic": doc.get("title_pid_generic")
+ or doc.get("pid_generic"),
+ "document_type": doc.get("document_type"),
+ "total_requests": 0,
+ "total_investigations": 0,
+ "unique_requests": 0,
+ "unique_investigations": 0,
+ }
+ title_documents[key]["total_requests"] += doc.get("total_requests", 0)
+ title_documents[key]["total_investigations"] += doc.get(
+ "total_investigations", 0
+ )
+ title_documents[key]["unique_requests"] += doc.get("unique_requests", 0)
+ title_documents[key]["unique_investigations"] += doc.get(
+ "unique_investigations", 0
+ )
+ continue
+
+ key = (
+ year_month,
+ doc.get("title_pid_generic"),
+ doc.get("pid_generic"),
+ doc.get("document_type"),
+ )
+ if key not in item_documents:
+ item_documents[key] = {
+ "year_month": year_month,
+ "title_pid_generic": doc.get("title_pid_generic"),
+ "segment_pid_generic": doc.get("pid_generic"),
+ "document_type": doc.get("document_type"),
+ "total_requests": 0,
+ "total_investigations": 0,
+ "unique_requests": 0,
+ "unique_investigations": 0,
+ }
+ item_documents[key]["total_requests"] += doc.get("total_requests", 0)
+ item_documents[key]["total_investigations"] += doc.get(
+ "total_investigations", 0
+ )
+ item_documents[key]["unique_requests"] += doc.get("unique_requests", 0)
+ item_documents[key]["unique_investigations"] += doc.get(
+ "unique_investigations", 0
+ )
+
+ return {
+ "item": list(item_documents.values()),
+ "title": list(title_documents.values()),
+ }
+
+ @staticmethod
+ def _write_item_csv(path, item_documents):
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with path.open("w", newline="") as fh:
+ writer = csv.DictWriter(
+ fh,
+ fieldnames=[
+ "year_month",
+ "title_pid_generic",
+ "segment_pid_generic",
+ "document_type",
+ "total_item_requests",
+ "total_item_investigations",
+ "unique_item_requests",
+ "unique_item_investigations",
+ ],
+ )
+ writer.writeheader()
+ for doc in sorted(
+ item_documents,
+ key=lambda item: (
+ item.get("year_month", ""),
+ item.get("title_pid_generic") or "",
+ item.get("segment_pid_generic") or "",
+ ),
+ ):
+ writer.writerow(
+ {
+ "year_month": doc.get("year_month", ""),
+ "title_pid_generic": doc.get("title_pid_generic"),
+ "segment_pid_generic": doc.get("segment_pid_generic"),
+ "document_type": doc.get("document_type"),
+ "total_item_requests": doc.get("total_requests", 0),
+ "total_item_investigations": doc.get("total_investigations", 0),
+ "unique_item_requests": doc.get("unique_requests", 0),
+ "unique_item_investigations": doc.get("unique_investigations", 0),
+ }
+ )
+
+ @staticmethod
+ def _write_title_csv(path, title_documents):
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with path.open("w", newline="") as fh:
+ writer = csv.DictWriter(
+ fh,
+ fieldnames=[
+ "year_month",
+ "title_pid_generic",
+ "document_type",
+ "total_item_requests",
+ "total_item_investigations",
+ "unique_title_requests",
+ "unique_title_investigations",
+ ],
+ )
+ writer.writeheader()
+ for doc in sorted(
+ title_documents,
+ key=lambda item: (
+ item.get("year_month", ""),
+ item.get("title_pid_generic") or "",
+ ),
+ ):
+ writer.writerow(
+ {
+ "year_month": doc.get("year_month", ""),
+ "title_pid_generic": doc.get("title_pid_generic"),
+ "document_type": doc.get("document_type"),
+ "total_item_requests": doc.get("total_requests", 0),
+ "total_item_investigations": doc.get("total_investigations", 0),
+ "unique_title_requests": doc.get("unique_requests", 0),
+ "unique_title_investigations": doc.get("unique_investigations", 0),
+ }
+ )
diff --git a/metrics/management/commands/schedule_cleanup_daily_payloads.py b/metrics/management/commands/schedule_cleanup_daily_payloads.py
new file mode 100644
index 0000000..285a23f
--- /dev/null
+++ b/metrics/management/commands/schedule_cleanup_daily_payloads.py
@@ -0,0 +1,68 @@
+from django.core.management.base import BaseCommand
+
+from core.utils.scheduler import schedule_task
+from metrics.tasks import task_cleanup_daily_payloads
+
+
+class Command(BaseCommand):
+ help = (
+ "Schedule the periodic cleanup of exported daily metric payload files. "
+ "Runs weekly on Sunday at 03:00 UTC by default, deleting payload files "
+ "for jobs that were exported more than 7 days ago."
+ )
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ "--day-of-week",
+ default="0",
+ help="Crontab day of week (0=Sunday, 6=Saturday). Default: 0",
+ )
+ parser.add_argument(
+ "--hour",
+ default="3",
+ help="Crontab hour (0-23). Default: 3",
+ )
+ parser.add_argument(
+ "--minute",
+ default="0",
+ help="Crontab minute (0-59). Default: 0",
+ )
+ parser.add_argument(
+ "--older-than-days",
+ type=int,
+ default=7,
+ help="Only delete payloads exported more than N days ago. Default: 7",
+ )
+ parser.add_argument(
+ "--collection",
+ action="append",
+ dest="collections",
+ help="Limit cleanup to a specific collection acronym. Repeat for multiple.",
+ )
+
+ def handle(self, *args, **options):
+ celery_task_name = task_cleanup_daily_payloads.name
+
+ kwargs = {
+ "older_than_days": options["older_than_days"],
+ "collections": options.get("collections") or [],
+ }
+
+ schedule_task(
+ task=celery_task_name,
+ name=celery_task_name,
+ kwargs=kwargs,
+ description="Weekly cleanup of exported daily payload files from disk.",
+ day_of_week=options["day_of_week"],
+ hour=options["hour"],
+ minute=options["minute"],
+ )
+
+ self.stdout.write(
+ self.style.SUCCESS(
+ f"Scheduled periodic task '{celery_task_name}' "
+ f"(day_of_week={options['day_of_week']}, hour={options['hour']}, "
+ f"minute={options['minute']}, older_than_days={kwargs['older_than_days']}, "
+ f"collections={kwargs['collections'] or 'all'})."
+ )
+ )
diff --git a/metrics/migrations/0001_initial.py b/metrics/migrations/0001_initial.py
index 30ccc96..9746d5f 100644
--- a/metrics/migrations/0001_initial.py
+++ b/metrics/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 5.0.7 on 2024-08-30 00:52
+# Generated by Codex on 2026-04-27
import django.db.models.deletion
from django.conf import settings
@@ -9,13 +9,13 @@ class Migration(migrations.Migration):
initial = True
dependencies = [
- ("wagtaildocs", "0013_delete_uploadeddocument"),
+ ("collection", "0001_initial"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
- name="Top100ArticlesFile",
+ name="DailyMetricJob",
fields=[
(
"id",
@@ -28,133 +28,85 @@ class Migration(migrations.Migration):
),
(
"created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
),
(
"updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
+ models.DateTimeField(auto_now=True, verbose_name="Last update date"),
+ ),
+ (
+ "access_date",
+ models.DateField(db_index=True, verbose_name="Access Date"),
),
(
"status",
models.CharField(
choices=[
- ("QUE", "Queued"),
- ("PAR", "Parsing"),
- ("PRO", "Processed"),
- ("INV", "Invalidated"),
+ ("PEN", "Pending"),
+ ("EXP", "Exporting"),
+ ("SUC", "Exported"),
+ ("ERR", "Error"),
],
- default="QUE",
- max_length=5,
+ db_index=True,
+ default="PEN",
+ max_length=3,
+ verbose_name="Status",
),
),
(
- "attachment",
- models.ForeignKey(
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="+",
- to="wagtaildocs.document",
- verbose_name="Attachment",
- ),
+ "input_log_hashes",
+ models.JSONField(default=list, verbose_name="Input Log Hashes"),
),
(
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
+ "storage_path",
+ models.CharField(
+ blank=True,
+ default="",
+ max_length=500,
+ verbose_name="Storage Path",
),
),
(
- "updated_by",
- models.ForeignKey(
+ "payload_hash",
+ models.CharField(
blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
+ default="",
+ max_length=64,
+ verbose_name="Payload Hash",
),
),
- ],
- options={
- "verbose_name": "Top 100 Articles File",
- "verbose_name_plural": "Top 100 Articles Files",
- },
- ),
- migrations.CreateModel(
- name="Top100Articles",
- fields=[
(
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
+ "summary",
+ models.JSONField(blank=True, default=dict, verbose_name="Summary"),
),
(
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
+ "attempts",
+ models.PositiveIntegerField(default=0, verbose_name="Attempts"),
),
(
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
+ "error_message",
+ models.TextField(blank=True, default="", verbose_name="Error Message"),
),
- ("pid_issn", models.CharField(max_length=9, verbose_name="PID ISSN")),
- ("year_month_day", models.DateField(verbose_name="Date of access")),
(
- "print_issn",
- models.CharField(
- blank=True, max_length=9, null=True, verbose_name="Print ISSN"
+ "export_started_at",
+ models.DateTimeField(
+ blank=True,
+ null=True,
+ verbose_name="Export Started At",
),
),
(
- "online_issn",
- models.CharField(
- blank=True, max_length=9, null=True, verbose_name="Online ISSN"
- ),
+ "exported_at",
+ models.DateTimeField(blank=True, null=True, verbose_name="Exported At"),
),
(
"collection",
- models.CharField(max_length=3, verbose_name="Collection Acronym 3"),
- ),
- ("pid", models.CharField(verbose_name="Publication ID")),
- (
- "yop",
- models.PositiveSmallIntegerField(
- verbose_name="Year of Publication"
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
),
),
- (
- "total_item_requests",
- models.IntegerField(verbose_name="Total Item Requests"),
- ),
- (
- "total_item_investigations",
- models.IntegerField(verbose_name="Total Item Investigations"),
- ),
- (
- "unique_item_requests",
- models.IntegerField(verbose_name="Unique Item Requests"),
- ),
- (
- "unique_item_investigations",
- models.IntegerField(verbose_name="Unique Item Investigations"),
- ),
(
"creator",
models.ForeignKey(
@@ -180,18 +132,23 @@ class Migration(migrations.Migration):
),
],
options={
- "verbose_name_plural": "Top 100 Articles",
- "indexes": [
- models.Index(
- fields=["pid_issn"], name="metrics_top_pid_iss_c1fba9_idx"
- ),
- models.Index(
- fields=["year_month_day"], name="metrics_top_year_mo_8cda7b_idx"
- ),
- ],
- "unique_together": {
- ("collection", "pid_issn", "pid", "year_month_day")
- },
+ "verbose_name": "Daily Metric Job",
+ "verbose_name_plural": "Daily Metric Jobs",
+ "unique_together": {("collection", "access_date")},
},
),
+ migrations.AddIndex(
+ model_name="dailymetricjob",
+ index=models.Index(
+ fields=["collection", "access_date"],
+ name="metrics_daily_coll_date_idx",
+ ),
+ ),
+ migrations.AddIndex(
+ model_name="dailymetricjob",
+ index=models.Index(
+ fields=["status", "export_started_at"],
+ name="metrics_daily_status_exp_idx",
+ ),
+ ),
]
diff --git a/metrics/migrations/0002_alter_top100articlesfile_status.py b/metrics/migrations/0002_alter_top100articlesfile_status.py
deleted file mode 100644
index b2b98c5..0000000
--- a/metrics/migrations/0002_alter_top100articlesfile_status.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Generated by Django 5.0.7 on 2024-08-30 21:27
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0001_initial"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="top100articlesfile",
- name="status",
- field=models.CharField(
- choices=[
- ("QUE", "Queued"),
- ("PAR", "Parsing"),
- ("PRO", "Processed"),
- ("ERR", "Error"),
- ("INV", "Invalidated"),
- ],
- default="QUE",
- max_length=5,
- ),
- ),
- ]
diff --git a/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py b/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py
deleted file mode 100644
index 8b01d80..0000000
--- a/metrics/migrations/0003_remove_top100articlesfile_attachment_and_more.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-07 16:55
-
-import django.db.models.deletion
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("article", "0001_initial"),
- ("collection", "0001_initial"),
- ("journal", "0001_initial"),
- ("metrics", "0002_alter_top100articlesfile_status"),
- ]
-
- operations = [
- migrations.RemoveField(
- model_name="top100articlesfile",
- name="attachment",
- ),
- migrations.RemoveField(
- model_name="top100articlesfile",
- name="creator",
- ),
- migrations.RemoveField(
- model_name="top100articlesfile",
- name="updated_by",
- ),
- migrations.CreateModel(
- name="Item",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "article",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="article.article",
- verbose_name="Article",
- ),
- ),
- (
- "collection",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="collection.collection",
- verbose_name="Collection",
- ),
- ),
- (
- "journal",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="journal.journal",
- verbose_name="Journal",
- ),
- ),
- ],
- options={
- "verbose_name": "Item",
- "verbose_name_plural": "Items",
- },
- ),
- migrations.CreateModel(
- name="UserAgent",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "name",
- models.CharField(
- db_index=True, max_length=255, verbose_name="Name"
- ),
- ),
- (
- "version",
- models.CharField(
- db_index=True, max_length=255, verbose_name="Version"
- ),
- ),
- ],
- options={
- "verbose_name": "User Agent",
- "verbose_name_plural": "User Agents",
- "unique_together": {("name", "version")},
- },
- ),
- migrations.CreateModel(
- name="UserSession",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- ("datetime", models.DateTimeField(verbose_name="Datetime")),
- (
- "user_ip",
- models.CharField(
- db_index=True, max_length=255, verbose_name="User IP"
- ),
- ),
- (
- "user_agent",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="metrics.useragent",
- verbose_name="User Agent",
- ),
- ),
- ],
- options={
- "verbose_name": "User Session",
- "verbose_name_plural": "User Sessions",
- },
- ),
- migrations.CreateModel(
- name="ItemAccess",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "country_code",
- models.CharField(
- db_index=True, max_length=2, verbose_name="Country"
- ),
- ),
- (
- "media_language",
- models.CharField(
- db_index=True, max_length=2, verbose_name="Media Language"
- ),
- ),
- (
- "media_format",
- models.CharField(max_length=10, verbose_name="Media Format"),
- ),
- (
- "item",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="metrics.item",
- verbose_name="Item",
- ),
- ),
- (
- "user_session",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="metrics.usersession",
- verbose_name="User Session",
- ),
- ),
- ],
- options={
- "verbose_name": "Item Access",
- "verbose_name_plural": "Items Access",
- },
- ),
- migrations.DeleteModel(
- name="Top100Articles",
- ),
- ]
diff --git a/metrics/migrations/0004_delete_top100articlesfile_and_more.py b/metrics/migrations/0004_delete_top100articlesfile_and_more.py
deleted file mode 100644
index b10c41b..0000000
--- a/metrics/migrations/0004_delete_top100articlesfile_and_more.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-07 16:55
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0003_remove_top100articlesfile_attachment_and_more"),
- ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"),
- ]
-
- operations = [
- migrations.DeleteModel(
- name="Top100ArticlesFile",
- ),
- migrations.AddIndex(
- model_name="item",
- index=models.Index(
- fields=["collection", "journal", "article"],
- name="metrics_ite_collect_6971a5_idx",
- ),
- ),
- migrations.AddIndex(
- model_name="item",
- index=models.Index(
- fields=["collection", "journal"], name="metrics_ite_collect_b5f79b_idx"
- ),
- ),
- migrations.AlterUniqueTogether(
- name="item",
- unique_together={("collection", "journal", "article")},
- ),
- migrations.AlterUniqueTogether(
- name="usersession",
- unique_together={("datetime", "user_agent", "user_ip")},
- ),
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together={
- (
- "item",
- "user_session",
- "country_code",
- "media_format",
- "media_language",
- )
- },
- ),
- ]
diff --git a/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py b/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py
deleted file mode 100644
index 7bfafff..0000000
--- a/metrics/migrations/0005_alter_itemaccess_unique_together_and_more.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-27 20:40
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0004_delete_top100articlesfile_and_more"),
- ]
-
- operations = [
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together=set(),
- ),
- migrations.AddField(
- model_name="itemaccess",
- name="click_timestamps",
- field=models.JSONField(default=dict, verbose_name="Click Timestamps"),
- ),
- migrations.AddField(
- model_name="itemaccess",
- name="content_type",
- field=models.CharField(
- default="undefined", max_length=16, verbose_name="Content Type"
- ),
- preserve_default=False,
- ),
- migrations.AlterField(
- model_name="itemaccess",
- name="media_format",
- field=models.CharField(
- db_index=True, max_length=10, verbose_name="Media Format"
- ),
- ),
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together={
- (
- "item",
- "user_session",
- "country_code",
- "media_format",
- "media_language",
- "content_type",
- )
- },
- ),
- ]
diff --git a/metrics/migrations/0006_alter_itemaccess_content_type.py b/metrics/migrations/0006_alter_itemaccess_content_type.py
deleted file mode 100644
index 0e81287..0000000
--- a/metrics/migrations/0006_alter_itemaccess_content_type.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-31 21:07
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0005_alter_itemaccess_unique_together_and_more"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="itemaccess",
- name="content_type",
- field=models.CharField(max_length=32, verbose_name="Content Type"),
- ),
- ]
diff --git a/metrics/migrations/0007_alter_usersession_datetime_and_more.py b/metrics/migrations/0007_alter_usersession_datetime_and_more.py
deleted file mode 100644
index e45036e..0000000
--- a/metrics/migrations/0007_alter_usersession_datetime_and_more.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-12 17:16
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0006_alter_itemaccess_content_type"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="usersession",
- name="datetime",
- field=models.DateTimeField(db_index=True, verbose_name="Datetime"),
- ),
- migrations.AddIndex(
- model_name="itemaccess",
- index=models.Index(
- fields=["item", "user_session"], name="metrics_ite_item_id_8799c9_idx"
- ),
- ),
- ]
diff --git a/metrics/migrations/0008_remove_a_few_models.py b/metrics/migrations/0008_remove_a_few_models.py
deleted file mode 100644
index dfd14ec..0000000
--- a/metrics/migrations/0008_remove_a_few_models.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-22 17:45
-
-from django.db import migrations
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0007_alter_usersession_datetime_and_more"),
- ]
-
- operations = [
- migrations.AlterUniqueTogether(
- name="itemaccess",
- unique_together=None,
- ),
- migrations.AlterUniqueTogether(
- name="useragent",
- unique_together=None,
- ),
- migrations.AlterUniqueTogether(
- name="usersession",
- unique_together=None,
- ),
- migrations.RemoveField(
- model_name="itemaccess",
- name="user_session",
- ),
- migrations.RemoveField(
- model_name="usersession",
- name="user_agent",
- ),
- migrations.RemoveField(
- model_name="itemaccess",
- name="item",
- ),
- migrations.DeleteModel(
- name="Item",
- ),
- migrations.DeleteModel(
- name="ItemAccess",
- ),
- migrations.DeleteModel(
- name="UserAgent",
- ),
- migrations.DeleteModel(
- name="UserSession",
- ),
- ]
diff --git a/metrics/models.py b/metrics/models.py
index e69de29..aa789b5 100644
--- a/metrics/models.py
+++ b/metrics/models.py
@@ -0,0 +1,108 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+from core.models import CommonControlField
+
+
+class DailyMetricJob(CommonControlField):
+ STATUS_PENDING = "PEN"
+ STATUS_EXPORTING = "EXP"
+ STATUS_EXPORTED = "SUC"
+ STATUS_ERROR = "ERR"
+ STATUS_CHOICES = (
+ (STATUS_PENDING, _("Pending")),
+ (STATUS_EXPORTING, _("Exporting")),
+ (STATUS_EXPORTED, _("Exported")),
+ (STATUS_ERROR, _("Error")),
+ )
+
+ collection = models.ForeignKey(
+ Collection,
+ verbose_name=_("Collection"),
+ on_delete=models.CASCADE,
+ db_index=True,
+ )
+
+ access_date = models.DateField(
+ verbose_name=_("Access Date"),
+ db_index=True,
+ )
+
+ status = models.CharField(
+ verbose_name=_("Status"),
+ max_length=3,
+ choices=STATUS_CHOICES,
+ default=STATUS_PENDING,
+ db_index=True,
+ )
+
+ input_log_hashes = models.JSONField(
+ verbose_name=_("Input Log Hashes"),
+ default=list,
+ )
+
+ storage_path = models.CharField(
+ verbose_name=_("Storage Path"),
+ max_length=500,
+ blank=True,
+ default="",
+ )
+
+ payload_hash = models.CharField(
+ verbose_name=_("Payload Hash"),
+ max_length=64,
+ blank=True,
+ default="",
+ )
+
+ summary = models.JSONField(
+ verbose_name=_("Summary"),
+ default=dict,
+ blank=True,
+ )
+
+ attempts = models.PositiveIntegerField(
+ verbose_name=_("Attempts"),
+ default=0,
+ )
+
+ error_message = models.TextField(
+ verbose_name=_("Error Message"),
+ blank=True,
+ default="",
+ )
+
+ export_started_at = models.DateTimeField(
+ verbose_name=_("Export Started At"),
+ null=True,
+ blank=True,
+ )
+
+ exported_at = models.DateTimeField(
+ verbose_name=_("Exported At"),
+ null=True,
+ blank=True,
+ )
+
+ @property
+ def input_log_count(self):
+ return len(self.input_log_hashes or [])
+
+ @property
+ def job_id(self):
+ if not self.payload_hash:
+ return ""
+ return f"{self.collection.acron3}|{self.access_date.isoformat()}|{self.payload_hash}"
+
+ class Meta:
+ verbose_name = _("Daily Metric Job")
+ verbose_name_plural = _("Daily Metric Jobs")
+ unique_together = (("collection", "access_date"),)
+ indexes = [
+ models.Index(fields=["collection", "access_date"], name="metrics_daily_coll_date_idx"),
+ models.Index(fields=["status", "export_started_at"], name="metrics_daily_status_exp_idx"),
+ ]
+
+ def __str__(self):
+ return f"{self.collection.acron3}-{self.access_date}"
diff --git a/metrics/opensearch/__init__.py b/metrics/opensearch/__init__.py
new file mode 100644
index 0000000..fb9df20
--- /dev/null
+++ b/metrics/opensearch/__init__.py
@@ -0,0 +1,8 @@
+from .client import OpenSearchUsageClient
+from .mappings import (
+ BOOKS_MONTH_INDEX_MAPPINGS,
+ BOOKS_YEAR_INDEX_MAPPINGS,
+ MONTH_INDEX_MAPPINGS,
+ YEAR_INDEX_MAPPINGS,
+ get_index_mappings,
+)
diff --git a/metrics/opensearch/client.py b/metrics/opensearch/client.py
new file mode 100644
index 0000000..ce0de5c
--- /dev/null
+++ b/metrics/opensearch/client.py
@@ -0,0 +1,225 @@
+import logging
+
+from django.conf import settings
+from opensearchpy import NotFoundError, OpenSearch, helpers
+
+from metrics.opensearch.names import generate_month_index_name, generate_year_index_name
+
+from .mappings import get_index_mappings
+from .scripts import (
+ IDEMPOTENT_JOB_INCREMENT_SCRIPT,
+ METRIC_FIELDS,
+ build_idempotent_job_increment_action,
+ merge_metric_document,
+)
+
+
+class OpenSearchUsageClient:
+ def __init__(self, url=None, basic_auth=None, api_key=None, verify_certs=None):
+ self.client = self.get_opensearch_client(url, basic_auth, api_key, verify_certs)
+
+ def get_opensearch_client(self, url=None, basic_auth=None, api_key=None, verify_certs=None):
+ url = url or getattr(settings, "OPENSEARCH_URL", None)
+ basic_auth = basic_auth or getattr(settings, "OPENSEARCH_BASIC_AUTH", None)
+ api_key = api_key or getattr(settings, "OPENSEARCH_API_KEY", None)
+ if verify_certs is None:
+ verify_certs = getattr(settings, "OPENSEARCH_VERIFY_CERTS", False)
+
+ if basic_auth:
+ return OpenSearch(url, http_auth=tuple(basic_auth), verify_certs=verify_certs)
+ if api_key:
+ return OpenSearch(url, api_key=api_key, verify_certs=verify_certs)
+ return OpenSearch(url, verify_certs=verify_certs)
+
+ def ping(self):
+ try:
+ return self.client.ping()
+ except Exception as exc:
+ logging.error("Error pinging OpenSearch client: %s", exc)
+ return False
+
+ def create_index(self, index_name, mappings, ping_client=False):
+ if ping_client and not self.ping():
+ return
+
+ response = self.client.indices.create(
+ index=index_name,
+ body={
+ "settings": {"index": {"number_of_replicas": 0}},
+ "mappings": mappings,
+ },
+ )
+ logging.info("Index %s created: %s", index_name, response)
+
+ def create_index_if_not_exists(self, index_name, mappings, ping_client=False):
+ if ping_client and not self.ping():
+ return
+
+ if not self.client.indices.exists(index=index_name):
+ self.create_index(index_name=index_name, mappings=mappings, ping_client=False)
+
+ def ensure_usage_indexes(self, collection, access_date, index_prefix=None):
+ index_prefix = index_prefix or getattr(settings, "OPENSEARCH_INDEX_NAME", "usage")
+ year_index = generate_year_index_name(index_prefix, collection, access_date)
+ month_index = generate_month_index_name(index_prefix, collection, access_date)
+
+ self.create_index_if_not_exists(year_index, get_index_mappings(collection, "year"))
+ self.create_index_if_not_exists(month_index, get_index_mappings(collection, "month"))
+
+ return {"year": year_index, "month": month_index}
+
+ def delete_index(self, index_name, ping_client=False):
+ if ping_client and not self.ping():
+ return
+ self.client.indices.delete(index=index_name)
+
+ def index_documents(self, index_name, documents, ping_client=False):
+ if ping_client and not self.ping():
+ return
+
+ if not documents:
+ return
+
+ helpers.bulk(
+ self.client,
+ (
+ {"_index": index_name, "_id": doc_id, "_source": document}
+ for doc_id, document in documents.items()
+ ),
+ )
+
+ def increment_documents_for_daily_job(
+ self,
+ index_name,
+ documents,
+ job_id,
+ ping_client=False,
+ ):
+ if ping_client and not self.ping():
+ return
+
+ if not documents:
+ return
+
+ helpers.bulk(
+ self.client,
+ (
+ build_idempotent_job_increment_action(
+ index_name=index_name,
+ doc_id=doc_id,
+ document=document,
+ job_id=job_id,
+ )
+ for doc_id, document in documents.items()
+ ),
+ )
+
+ def delete_documents(self, index_name, doc_ids, ping_client=False):
+ if ping_client and not self.ping():
+ return
+
+ if not doc_ids:
+ return
+
+ helpers.bulk(
+ self.client,
+ (
+ {"_op_type": "delete", "_index": index_name, "_id": doc_id}
+ for doc_id in doc_ids
+ ),
+ )
+
+ def delete_documents_by_key(self, index_name, data, ping_client=False):
+ if ping_client and not self.ping():
+ return False
+
+ query = {
+ "query": {
+ "bool": {
+ "must": [
+ {
+ "terms": {
+ key: values if isinstance(values, list) else [values],
+ }
+ }
+ for key, values in data.items()
+ ]
+ }
+ }
+ }
+
+ try:
+ self.client.delete_by_query(index=index_name, body=query)
+ return True
+ except Exception as exc:
+ logging.error("Failed to delete documents from %s: %s", index_name, exc)
+ return False
+
+ def fetch_documents_by_ids(self, index_name, doc_ids, ping_client=False):
+ if ping_client and not self.ping():
+ return {}
+
+ if not doc_ids:
+ return {}
+
+ try:
+ response = self.client.mget(index=index_name, body={"ids": doc_ids})
+ except NotFoundError:
+ return {}
+
+ documents = {}
+ for document in response.get("docs", []):
+ if document.get("found"):
+ documents[document["_id"]] = document["_source"]
+ return documents
+
+ def fetch_documents_by_key(self, index_name, data, ping_client=False):
+ if ping_client and not self.ping():
+ return {}
+
+ query = {
+ "query": {
+ "bool": {
+ "must": [
+ {
+ "terms": {
+ key: values if isinstance(values, list) else [values],
+ }
+ }
+ for key, values in data.items()
+ ]
+ }
+ }
+ }
+
+ try:
+ return {
+ hit["_id"]: hit["_source"]
+ for hit in helpers.scan(self.client, index=index_name, query=query)
+ }
+ except NotFoundError:
+ return {}
+
+ def sync_documents(self, index_name, documents, operation="add", ping_client=False):
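+        # Read-merge-write synchronization: fetch the current documents by id,
+        # merge (or subtract) the incoming counters, reindex the merged
+        # versions and delete documents whose counters dropped to zero.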
+ if ping_client and not self.ping():
+ return
+
+ if not documents:
+ return
+
+ existing_documents = self.fetch_documents_by_ids(index_name=index_name, doc_ids=list(documents.keys()))
+ upserts = {}
+ deletes = []
+
+ for doc_id, document in documents.items():
+ merged = merge_metric_document(existing_documents.get(doc_id), document, operation=operation)
+ if merged is None:
+ if doc_id in existing_documents:
+ deletes.append(doc_id)
+ continue
+ upserts[doc_id] = merged
+
+ if upserts:
+ self.index_documents(index_name=index_name, documents=upserts)
+ if deletes:
+ self.delete_documents(index_name=index_name, doc_ids=deletes)
diff --git a/metrics/opensearch/mappings.py b/metrics/opensearch/mappings.py
new file mode 100644
index 0000000..5825c1b
--- /dev/null
+++ b/metrics/opensearch/mappings.py
@@ -0,0 +1,177 @@
+YEAR_INDEX_MAPPINGS = {
+ "properties": {
+ "collection": {"type": "keyword"},
+ "source": {
+ "properties": {
+ "source_type": {"type": "keyword"},
+ "source_id": {"type": "keyword"},
+ "scielo_issn": {"type": "keyword"},
+ "main_title": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 512
+ }
+ }
+ },
+ "subject_area_capes": {"type": "keyword"},
+ "subject_area_wos": {"type": "keyword"},
+ "acronym": {"type": "keyword"},
+ "publisher_name": {"type": "keyword"},
+ "access_type": {"type": "keyword"},
+ "city": {"type": "keyword"},
+ "country": {"type": "keyword"},
+ "identifiers": {"type": "object"},
+ }
+ },
+ "document_type": {"type": "keyword"},
+ "scielo_document_type": {"type": "keyword"},
+ "metric_scope": {"type": "keyword"},
+ "counter_data_type": {"type": "keyword"},
+ "parent_data_type": {"type": "keyword"},
+ "article_version": {"type": "keyword"},
+ "pid": {"type": "keyword"},
+ "pid_v2": {"type": "keyword"},
+ "pid_v3": {"type": "keyword"},
+ "pid_generic": {"type": "keyword"},
+ "publication_year": {"type": "integer"},
+ "counter_access_type": {"type": "keyword"},
+ "access_method": {"type": "keyword"},
+ "access_year": {"type": "date", "format": "yyyy"},
+ "access_country_code": {"type": "keyword"},
+ "content_language": {"type": "keyword"},
+ "applied_jobs": {"type": "keyword", "index": False},
+ "total_requests": {"type": "integer"},
+ "total_investigations": {"type": "integer"},
+ "unique_requests": {"type": "integer"},
+ "unique_investigations": {"type": "integer"},
+ }
+}
+
+
+MONTH_INDEX_MAPPINGS = {
+ "properties": {
+ "collection": {"type": "keyword"},
+ "source": YEAR_INDEX_MAPPINGS["properties"]["source"],
+ "document_type": {"type": "keyword"},
+ "scielo_document_type": {"type": "keyword"},
+ "metric_scope": {"type": "keyword"},
+ "counter_data_type": {"type": "keyword"},
+ "parent_data_type": {"type": "keyword"},
+ "article_version": {"type": "keyword"},
+ "pid": {"type": "keyword"},
+ "pid_v2": {"type": "keyword"},
+ "pid_v3": {"type": "keyword"},
+ "pid_generic": {"type": "keyword"},
+ "publication_year": {"type": "integer"},
+ "counter_access_type": {"type": "keyword"},
+ "access_method": {"type": "keyword"},
+ "access_month": {"type": "date", "format": "yyyy-MM"},
+ "applied_jobs": {"type": "keyword", "index": False},
+ "daily_metrics": {"type": "object", "dynamic": True},
+ "total_requests": {"type": "integer"},
+ "total_investigations": {"type": "integer"},
+ "unique_requests": {"type": "integer"},
+ "unique_investigations": {"type": "integer"},
+ }
+}
+
+
+BOOKS_YEAR_INDEX_MAPPINGS = {
+ "properties": {
+ "collection": {"type": "keyword"},
+ "source": {
+ "properties": {
+ "source_type": {"type": "keyword"},
+ "source_id": {"type": "keyword"},
+ "main_title": {
+ "type": "text",
+ "fields": {
+ "keyword": {
+ "type": "keyword",
+ "ignore_above": 512
+ }
+ }
+ },
+ "access_type": {"type": "keyword"},
+ "publisher": {"type": "keyword"},
+ "city": {"type": "keyword"},
+ "country": {"type": "keyword"},
+ "identifiers": {
+ "properties": {
+ "book_id": {"type": "keyword"},
+ "isbn": {"type": "keyword"},
+ "eisbn": {"type": "keyword"},
+ "doi": {"type": "keyword"},
+ }
+ },
+ }
+ },
+ "document_type": {"type": "keyword"},
+ "scielo_document_type": {"type": "keyword"},
+ "metric_scope": {"type": "keyword"},
+ "counter_data_type": {"type": "keyword"},
+ "parent_data_type": {"type": "keyword"},
+ "article_version": {"type": "keyword"},
+ "pid": {"type": "keyword"},
+ "pid_generic": {"type": "keyword"},
+ "title_pid_generic": {"type": "keyword"},
+ "publication_year": {"type": "integer"},
+ "counter_access_type": {"type": "keyword"},
+ "access_method": {"type": "keyword"},
+ "access_year": {"type": "date", "format": "yyyy"},
+ "access_country_code": {"type": "keyword"},
+ "content_language": {"type": "keyword"},
+ "applied_jobs": {"type": "keyword", "index": False},
+ "total_requests": {"type": "integer"},
+ "total_investigations": {"type": "integer"},
+ "unique_requests": {"type": "integer"},
+ "unique_investigations": {"type": "integer"},
+ }
+}
+
+
+BOOKS_MONTH_INDEX_MAPPINGS = {
+ "properties": {
+ "collection": {"type": "keyword"},
+ "source": BOOKS_YEAR_INDEX_MAPPINGS["properties"]["source"],
+ "document_type": {"type": "keyword"},
+ "scielo_document_type": {"type": "keyword"},
+ "metric_scope": {"type": "keyword"},
+ "counter_data_type": {"type": "keyword"},
+ "parent_data_type": {"type": "keyword"},
+ "article_version": {"type": "keyword"},
+ "pid": {"type": "keyword"},
+ "pid_generic": {"type": "keyword"},
+ "title_pid_generic": {"type": "keyword"},
+ "publication_year": {"type": "integer"},
+ "counter_access_type": {"type": "keyword"},
+ "access_method": {"type": "keyword"},
+ "access_month": {"type": "date", "format": "yyyy-MM"},
+ "applied_jobs": {"type": "keyword", "index": False},
+ "daily_metrics": {"type": "object", "dynamic": True},
+ "total_requests": {"type": "integer"},
+ "total_investigations": {"type": "integer"},
+ "unique_requests": {"type": "integer"},
+ "unique_investigations": {"type": "integer"},
+ }
+}
+
+
+METRIC_FIELDS = (
+ "total_requests",
+ "total_investigations",
+ "unique_requests",
+ "unique_investigations",
+)
+
+
+def get_index_mappings(collection, granularity):
+ if granularity not in {"month", "year"}:
+ raise ValueError("Granularity must be 'month' or 'year'.")
+
+ if collection == "books":
+ return BOOKS_MONTH_INDEX_MAPPINGS if granularity == "month" else BOOKS_YEAR_INDEX_MAPPINGS
+
+ return MONTH_INDEX_MAPPINGS if granularity == "month" else YEAR_INDEX_MAPPINGS
diff --git a/metrics/opensearch/names.py b/metrics/opensearch/names.py
new file mode 100644
index 0000000..1ecd493
--- /dev/null
+++ b/metrics/opensearch/names.py
@@ -0,0 +1,41 @@
+from django.conf import settings
+
+
+def _validate_index_inputs(index_prefix: str, collection: str, date: str):
+ if not date or not isinstance(date, str):
+ raise ValueError("Date must be a non-empty string in 'YYYY-MM-DD' format.")
+ if not collection or not isinstance(collection, str):
+ raise ValueError("Collection must be a non-empty string.")
+ if not index_prefix or not isinstance(index_prefix, str):
+ raise ValueError("Index prefix must be a non-empty string.")
+
+
+def _get_collection_size(collection: str) -> str:
+ return getattr(settings, "COLLECTION_ACRON3_SIZE_MAP", {}).get(collection, "small")
+
+
+def extract_access_year(date: str) -> str:
+ _validate_index_inputs("usage", "tmp", date)
+ return date.split("-")[0]
+
+
+def extract_access_month(date: str) -> str:
+ _validate_index_inputs("usage", "tmp", date)
+ year, month, _ = date.split("-")
+ return f"{year}{month}"
+
+
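+# Index naming scheme: collections flagged "large"/"xlarge" in
+# COLLECTION_ACRON3_SIZE_MAP get one index per access year, while smaller
+# collections share a single index per granularity.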
+def generate_month_index_name(index_prefix: str, collection: str, date: str) -> str:
+ _validate_index_inputs(index_prefix, collection, date)
+ size = _get_collection_size(collection)
+ if size in ("xlarge", "large"):
+ return f"{index_prefix}_monthly_{collection}_{extract_access_year(date)}"
+ return f"{index_prefix}_monthly_{collection}"
+
+
+def generate_year_index_name(index_prefix: str, collection: str, date: str) -> str:
+ _validate_index_inputs(index_prefix, collection, date)
+ size = _get_collection_size(collection)
+ if size in ("xlarge", "large"):
+ return f"{index_prefix}_yearly_{collection}_{extract_access_year(date)}"
+ return f"{index_prefix}_yearly_{collection}"
diff --git a/metrics/opensearch/scripts.py b/metrics/opensearch/scripts.py
new file mode 100644
index 0000000..a6a5e1c
--- /dev/null
+++ b/metrics/opensearch/scripts.py
@@ -0,0 +1,102 @@
+METRIC_FIELDS = (
+ "total_requests",
+ "total_investigations",
+ "unique_requests",
+ "unique_investigations",
+)
+
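+# Painless script used by the bulk "update" actions below: each document keeps
+# the list of job ids already applied to it, so replaying the same daily job is
+# a no-op, while a new job id increments the totals and the nested daily_metrics.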
+IDEMPOTENT_JOB_INCREMENT_SCRIPT = """
+if (ctx._source.applied_jobs == null) {
+ ctx._source.applied_jobs = [];
+}
+if (ctx._source.applied_jobs.contains(params.job_id)) {
+ ctx.op = 'none';
+ return;
+}
+for (entry in params.document.entrySet()) {
+ if (!params.metric_fields.contains(entry.getKey()) && !'applied_jobs'.equals(entry.getKey()) && !'daily_metrics'.equals(entry.getKey())) {
+ if (!ctx._source.containsKey(entry.getKey()) || ctx._source[entry.getKey()] != entry.getValue()) {
+ ctx._source[entry.getKey()] = entry.getValue();
+ }
+ }
+}
+for (field in params.metric_fields) {
+ def currentValue = ctx._source.containsKey(field) ? ctx._source[field] : 0;
+ def increment = params.document.containsKey(field) ? params.document[field] : 0;
+ ctx._source[field] = currentValue + increment;
+}
+if (params.document.containsKey('daily_metrics')) {
+ if (!ctx._source.containsKey('daily_metrics') || ctx._source.daily_metrics == null) {
+ ctx._source.daily_metrics = new HashMap();
+ }
+ for (dayEntry in params.document.daily_metrics.entrySet()) {
+ def day = dayEntry.getKey();
+ def dayMetrics = dayEntry.getValue();
+ if (!ctx._source.daily_metrics.containsKey(day) || ctx._source.daily_metrics[day] == null) {
+ ctx._source.daily_metrics[day] = new HashMap();
+ }
+ for (metric in params.metric_fields) {
+ def currentValue = ctx._source.daily_metrics[day].containsKey(metric) ? ctx._source.daily_metrics[day][metric] : 0;
+ def increment = dayMetrics.containsKey(metric) ? dayMetrics[metric] : 0;
+ ctx._source.daily_metrics[day][metric] = currentValue + increment;
+ }
+ }
+}
+ctx._source.applied_jobs.add(params.job_id);
+"""
+
+
+def build_idempotent_job_increment_action(index_name, doc_id, document, job_id):
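+    # Bulk "update" action with scripted_upsert so missing documents are
+    # created by the same script; retry_on_conflict absorbs concurrent updates
+    # to the same document id.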
+ return {
+ "_op_type": "update",
+ "_index": index_name,
+ "_id": doc_id,
+ "retry_on_conflict": 5,
+ "scripted_upsert": True,
+ "script": {
+ "lang": "painless",
+ "source": IDEMPOTENT_JOB_INCREMENT_SCRIPT,
+ "params": {
+ "document": document,
+ "job_id": job_id,
+ "metric_fields": list(METRIC_FIELDS),
+ },
+ },
+ "upsert": {
+ "applied_jobs": [],
+ },
+ }
+
+
+def merge_metric_document(existing, current, operation="add"):
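+    # Merge (or, with operation="subtract", remove) the counters of "current"
+    # into "existing". Returns None when the document should be deleted: a
+    # subtraction with no existing document, or counters at zero or below.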
+ if existing is None:
+ if operation == "subtract":
+ return None
+ return current
+
+ merged = dict(existing)
+ merged.update(
+ {
+ key: value
+ for key, value in current.items()
+ if key not in METRIC_FIELDS and key != "daily_metrics"
+ }
+ )
+
+ signal = -1 if operation == "subtract" else 1
+ for field in METRIC_FIELDS:
+ merged[field] = existing.get(field, 0) + signal * current.get(field, 0)
+
+ if "daily_metrics" in current:
+ merged_daily = dict(existing.get("daily_metrics") or {})
+ for day, metrics in current["daily_metrics"].items():
+ day_merged = dict(merged_daily.get(day) or {})
+ for field in METRIC_FIELDS:
+ day_merged[field] = day_merged.get(field, 0) + signal * metrics.get(field, 0)
+ merged_daily[day] = day_merged
+ merged["daily_metrics"] = merged_daily
+
+ if all(merged.get(field, 0) <= 0 for field in METRIC_FIELDS):
+ return None
+
+ return merged
diff --git a/metrics/services/__init__.py b/metrics/services/__init__.py
new file mode 100644
index 0000000..b305681
--- /dev/null
+++ b/metrics/services/__init__.py
@@ -0,0 +1,26 @@
+from .jobs import (
+ acquire_daily_metric_job,
+ create_or_update_daily_metric_job,
+ mark_daily_metric_job_exported,
+ mark_daily_metric_job_failed,
+ release_stale_daily_metric_jobs,
+)
+from .resources import (
+ build_search_client,
+ extract_celery_queue_name,
+ fetch_required_resources,
+ get_log_files_for_collection_date,
+)
+from .parser import (
+ is_stale_parsing_log,
+ process_daily_metric_job,
+ process_line,
+ requeue_stale_parsing_log,
+ setup_parsing_environment,
+ touch_parse_heartbeat,
+)
+from .export import (
+ export_daily_metric_payload,
+ export_documents,
+ load_daily_metric_payload,
+)
diff --git a/metrics/services/daily_payloads.py b/metrics/services/daily_payloads.py
new file mode 100644
index 0000000..0e06af9
--- /dev/null
+++ b/metrics/services/daily_payloads.py
@@ -0,0 +1,127 @@
+import hashlib
+import json
+import logging
+import os
+from datetime import timedelta
+from pathlib import Path
+
+from django.conf import settings
+from django.utils import timezone
+
+
+def get_daily_payload_root():
+ return Path(settings.MEDIA_ROOT) / "metrics" / "daily_payloads"
+
+
+def build_daily_storage_path(collection, access_date):
+ return (
+ Path(collection.acron3)
+ / access_date.strftime("%Y")
+ / access_date.strftime("%m")
+ / f"{access_date.isoformat()}.json"
+ )
+
+
+def resolve_storage_path(storage_path):
+ return get_daily_payload_root() / storage_path
+
+
+def serialize_payload(payload):
+ return json.dumps(
+ payload,
+ ensure_ascii=True,
+ sort_keys=True,
+ separators=(",", ":"),
+ )
+
+
+def write_payload(storage_path, payload):
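+    # Serialize deterministically, write to a temporary file and atomically
+    # replace the target, then return the SHA-256 of the serialized payload,
+    # which later serves as the idempotency key for the daily export job.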
+ resolved_path = resolve_storage_path(storage_path)
+ resolved_path.parent.mkdir(parents=True, exist_ok=True)
+
+ payload_json = serialize_payload(payload)
+ payload_hash = hashlib.sha256(payload_json.encode("utf-8")).hexdigest()
+
+ tmp_path = resolved_path.with_suffix(f"{resolved_path.suffix}.tmp")
+ tmp_path.write_text(payload_json, encoding="utf-8")
+ tmp_path.replace(resolved_path)
+
+ return payload_hash
+
+
+def read_payload(storage_path):
+ resolved_path = resolve_storage_path(storage_path)
+ return json.loads(resolved_path.read_text(encoding="utf-8"))
+
+
+def delete_payload(storage_path):
+ resolved_path = resolve_storage_path(storage_path)
+ if resolved_path.exists():
+ resolved_path.unlink()
+
+
+def cleanup_exported_payloads(collections=None, older_than_days=7):
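+    # Delete payload files older than the cutoff, but only when the tracking
+    # DailyMetricJob (if any) has already been exported; tracked jobs get their
+    # storage_path/payload_hash cleared so the stale file is not reused.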
+ from metrics.models import DailyMetricJob
+
+ root = get_daily_payload_root()
+ if not root.exists():
+ return 0
+
+ cutoff = timezone.now() - timedelta(days=older_than_days) if older_than_days and older_than_days > 0 else None
+
+ storage_path_to_job = {}
+ db_queryset = DailyMetricJob.objects.exclude(storage_path="")
+ if collections:
+ db_queryset = db_queryset.filter(collection__acron3__in=collections)
+ for job in db_queryset.iterator(chunk_size=500):
+ storage_path_to_job[job.storage_path] = job
+
+ json_files = root.rglob("*.json")
+ if collections:
+ json_files = [p for p in json_files if p.relative_to(root).parts[0] in collections]
+
+ deleted_count = 0
+ for file_path in json_files:
+ if cutoff and _file_is_recent(file_path, cutoff):
+ continue
+
+ storage_path = file_path.relative_to(root).as_posix()
+ job = storage_path_to_job.get(storage_path)
+
+ if job is not None and job.status != DailyMetricJob.STATUS_EXPORTED:
+ continue
+
+ try:
+ file_path.unlink()
+ except FileNotFoundError:
+ pass
+ deleted_count += 1
+
+ if job is not None:
+ job.storage_path = ""
+ job.payload_hash = ""
+ job.save(update_fields=["storage_path", "payload_hash", "updated"])
+
+ _cleanup_empty_dirs(root)
+
+ logging.info(
+ "Cleaned up %s daily payload files (collections=%s, older_than_days=%s).",
+ deleted_count,
+ collections or "all",
+ older_than_days,
+ )
+ return deleted_count
+
+
+def _file_is_recent(file_path, cutoff):
+ return file_path.stat().st_mtime >= cutoff.timestamp()
+
+
+def _cleanup_empty_dirs(root):
+ for dirpath, dirnames, filenames in os.walk(root, topdown=False):
+ if dirpath == str(root):
+ continue
+ try:
+ os.rmdir(dirpath)
+ except OSError:
+ pass
diff --git a/metrics/services/export.py b/metrics/services/export.py
new file mode 100644
index 0000000..03efbc6
--- /dev/null
+++ b/metrics/services/export.py
@@ -0,0 +1,94 @@
+import logging
+
+from django.conf import settings
+
+from metrics import opensearch
+from metrics.opensearch.names import generate_month_index_name, generate_year_index_name
+
+from . import daily_payloads
+
+
+def load_daily_metric_payload(job):
+ if not job.storage_path:
+ return None
+ try:
+ return daily_payloads.read_payload(job.storage_path)
+ except FileNotFoundError:
+ logging.warning("Daily metric payload not found for job %s.", job.pk)
+ return None
+
+
+def export_daily_metric_payload(search_client, job, payload):
+ if not job.job_id:
+ raise RuntimeError("Daily metric job has no payload hash.")
+
+ export_documents(
+ search_client=search_client,
+ documents=payload.get("documents") or {},
+ collection=payload.get("collection") or job.collection.acron3,
+ job_id=job.job_id,
+ )
+
+
+def export_documents(search_client, documents, collection, job_id):
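+    # Route monthly and yearly documents to their respective indices and apply
+    # them as idempotent, job-scoped increments.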
+ if not documents:
+ return
+
+ _sync_documents_group(
+ search_client=search_client,
+ collection=collection,
+ documents=documents.get("month", {}),
+ granularity="month",
+ job_id=job_id,
+ )
+ _sync_documents_group(
+ search_client=search_client,
+ collection=collection,
+ documents=documents.get("year", {}),
+ granularity="year",
+ job_id=job_id,
+ )
+
+
+def _sync_documents_group(
+ search_client,
+ collection,
+ documents,
+ granularity,
+ job_id,
+):
+ if not documents:
+ return
+
+ grouped_documents = {}
+ index_prefix = settings.OPENSEARCH_INDEX_NAME
+
+ for doc_id, document in documents.items():
+ if granularity == "month":
+ index_name = generate_month_index_name(
+ index_prefix=index_prefix,
+ collection=collection,
+ date=f"{document.get('access_month')}-01",
+ )
+ mappings = opensearch.get_index_mappings(collection, "month")
+ else:
+ index_name = generate_year_index_name(
+ index_prefix=index_prefix,
+ collection=collection,
+ date=f"{document.get('access_year')}-01-01",
+ )
+ mappings = opensearch.get_index_mappings(collection, "year")
+
+ grouped_documents.setdefault(index_name, {"mappings": mappings, "documents": {}})
+ grouped_documents[index_name]["documents"][doc_id] = document
+
+ for index_name, payload in grouped_documents.items():
+ search_client.create_index_if_not_exists(
+ index_name=index_name,
+ mappings=payload["mappings"],
+ )
+ search_client.increment_documents_for_daily_job(
+ index_name=index_name,
+ documents=payload["documents"],
+ job_id=job_id,
+ )
diff --git a/metrics/services/jobs.py b/metrics/services/jobs.py
new file mode 100644
index 0000000..78f5100
--- /dev/null
+++ b/metrics/services/jobs.py
@@ -0,0 +1,153 @@
+import logging
+from datetime import timedelta
+
+from django.db import transaction
+from django.utils import timezone
+
+from log_manager import choices
+from log_manager.models import LogFile
+
+from metrics.models import DailyMetricJob
+
+
+def create_or_update_daily_metric_job(collection, access_date, log_files):
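+    # Get or (re)queue the job for one collection/day. An already exported job
+    # with the same input hashes is left untouched (its logs are just marked
+    # processed); a changed input set on an exported day raises, because that
+    # day must be recomputed from scratch first.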
+ input_log_hashes = sorted(log_file.hash for log_file in log_files if log_file.hash)
+ with transaction.atomic():
+ job, _ = DailyMetricJob.objects.select_for_update().get_or_create(
+ collection=collection,
+ access_date=access_date,
+ )
+
+ if job.status == DailyMetricJob.STATUS_EXPORTED:
+ if job.input_log_hashes != input_log_hashes:
+ raise RuntimeError(
+ f"Daily metric job already exported for {collection.acron3} {access_date}. "
+ "Recompute requires deleting/recreating the affected day or period first."
+ )
+ LogFile.objects.filter(hash__in=input_log_hashes).update(
+ status=choices.LOG_FILE_STATUS_PROCESSED,
+ parse_heartbeat_at=None,
+ updated=timezone.now(),
+ )
+ return job
+
+ keep_payload = (
+ job.status == DailyMetricJob.STATUS_ERROR
+ and job.input_log_hashes == input_log_hashes
+ and job.storage_path
+ and job.payload_hash
+ )
+
+ job.input_log_hashes = input_log_hashes
+ job.status = DailyMetricJob.STATUS_PENDING
+ job.error_message = ""
+ job.export_started_at = None
+ job.exported_at = None
+ if not keep_payload:
+ job.storage_path = ""
+ job.payload_hash = ""
+ job.summary = {}
+ job.save(
+ update_fields=[
+ "input_log_hashes",
+ "status",
+ "error_message",
+ "export_started_at",
+ "exported_at",
+ "storage_path",
+ "payload_hash",
+ "summary",
+ "updated",
+ ]
+ )
+ return job
+
+
+def acquire_daily_metric_job(job_id):
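+    # Row-level lock so only one worker exports a given day: returns None if
+    # the job is already exporting/exported, otherwise flips it to EXPORTING,
+    # bumps the attempt counter and stamps export_started_at.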
+ with transaction.atomic():
+ job = (
+ DailyMetricJob.objects.select_for_update()
+ .select_related("collection")
+ .get(pk=job_id)
+ )
+ if job.status in {
+ DailyMetricJob.STATUS_EXPORTING,
+ DailyMetricJob.STATUS_EXPORTED,
+ }:
+ logging.info("Daily metric job %s is already in final/active state.", job_id)
+ return None
+
+ job.status = DailyMetricJob.STATUS_EXPORTING
+ job.attempts += 1
+ job.error_message = ""
+ job.export_started_at = timezone.now()
+ job.save(
+ update_fields=[
+ "status",
+ "attempts",
+ "error_message",
+ "export_started_at",
+ "updated",
+ ]
+ )
+ return job
+
+
+def mark_daily_metric_job_failed(job, error_message):
+ DailyMetricJob.objects.filter(pk=job.pk).update(
+ status=DailyMetricJob.STATUS_ERROR,
+ error_message=str(error_message),
+ updated=timezone.now(),
+ )
+ LogFile.objects.filter(hash__in=job.input_log_hashes).update(
+ status=choices.LOG_FILE_STATUS_ERROR,
+ parse_heartbeat_at=None,
+ updated=timezone.now(),
+ )
+
+
+def mark_daily_metric_job_exported(job, user=None):
+ DailyMetricJob.objects.filter(pk=job.pk).update(
+ status=DailyMetricJob.STATUS_EXPORTED,
+ error_message="",
+ exported_at=timezone.now(),
+ updated=timezone.now(),
+ )
+ LogFile.objects.filter(hash__in=job.input_log_hashes).update(
+ status=choices.LOG_FILE_STATUS_PROCESSED,
+ parse_heartbeat_at=None,
+ updated=timezone.now(),
+ )
+
+
+def release_stale_daily_metric_jobs(collections=None, from_date=None, until_date=None, stale_after_minutes=60):
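+    # Jobs stuck in EXPORTING past the cutoff are flipped to ERROR so they can
+    # be retried; their input log files are reset accordingly.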
+ cutoff = timezone.now() - timedelta(minutes=stale_after_minutes)
+ queryset = DailyMetricJob.objects.filter(
+ status=DailyMetricJob.STATUS_EXPORTING,
+ export_started_at__lt=cutoff,
+ )
+ if collections:
+ queryset = queryset.filter(collection__acron3__in=collections)
+ if from_date:
+ queryset = queryset.filter(access_date__gte=from_date)
+ if until_date:
+ queryset = queryset.filter(access_date__lte=until_date)
+
+ stale_jobs = list(queryset.only("pk", "input_log_hashes"))
+ released = queryset.update(
+ status=DailyMetricJob.STATUS_ERROR,
+ error_message="Job marked for retry after stale exporting state.",
+ updated=timezone.now(),
+ )
+ stale_hashes = {
+ log_hash
+ for job in stale_jobs
+ for log_hash in (job.input_log_hashes or [])
+ }
+ if stale_hashes:
+ LogFile.objects.filter(hash__in=stale_hashes).update(
+ status=choices.LOG_FILE_STATUS_ERROR,
+ parse_heartbeat_at=None,
+ updated=timezone.now(),
+ )
+ return released
diff --git a/metrics/services/parser.py b/metrics/services/parser.py
new file mode 100644
index 0000000..5eb3dbf
--- /dev/null
+++ b/metrics/services/parser.py
@@ -0,0 +1,249 @@
+import logging
+from datetime import timedelta
+from time import monotonic
+
+from django.conf import settings
+from django.utils import timezone
+
+from scielo_usage_counter import log_handler, url_translator
+
+from log_manager import choices
+from log_manager.models import LogFile
+from log_manager_config.models import CollectionLogDirectory
+from source.models import Source
+from document.models import Document
+from tracker.choices import (
+ LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT,
+ LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE,
+)
+from tracker.models import LogFileDiscardedLine
+
+from metrics.counter import access, documents as index_docs
+from metrics.counter import parser
+
+from .resources import get_log_files_for_collection_date
+from . import daily_payloads
+
+
+def process_daily_metric_job(job, robots_list, mmdb, track_errors=False):
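+    # Parse every log file for the job's collection/date, aggregate the COUNTER
+    # documents, persist the daily payload to disk and record its hash and
+    # summary on the job. Returns the payload dict.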
+ log_files = get_log_files_for_collection_date(
+ collection=job.collection,
+ access_date=job.access_date,
+ )
+ if not log_files:
+ raise RuntimeError(f"No log files found for {job.collection.acron3} {job.access_date}.")
+
+ results = {}
+ summary = {
+ "log_files": len(log_files),
+ "input_log_hashes": sorted(log_file.hash for log_file in log_files if log_file.hash),
+ "lines_parsed": 0,
+ "valid_lines": 0,
+ "discarded_lines": 0,
+ }
+
+ LogFile.objects.filter(pk__in=[log_file.pk for log_file in log_files]).update(
+ status=choices.LOG_FILE_STATUS_PARSING,
+ summary={},
+ last_processed_line=0,
+ parse_heartbeat_at=timezone.now(),
+ updated=timezone.now(),
+ )
+ LogFileDiscardedLine.objects.filter(log_file_id__in=[log_file.pk for log_file in log_files]).delete()
+
+ heartbeat_interval_seconds = getattr(settings, "METRICS_PARSE_HEARTBEAT_INTERVAL_SECONDS", 30)
+
+ for log_file in log_files:
+ log_parser, url_translator_manager = setup_parsing_environment(
+ log_file=log_file,
+ robots_list=robots_list,
+ mmdb=mmdb,
+ )
+ line_count = 0
+ valid_count = 0
+ errors = []
+ last_heartbeat_monotonic = monotonic()
+
+ for line in log_parser.parse():
+ line_count += 1
+ if monotonic() - last_heartbeat_monotonic >= heartbeat_interval_seconds:
+ touch_parse_heartbeat(log_file, log_parser.stats.lines_parsed)
+ last_heartbeat_monotonic = monotonic()
+
+ is_valid_line, error_obj = process_line(
+ results=results,
+ line=line,
+ utm=url_translator_manager,
+ log_file=log_file,
+ track_errors=track_errors,
+ )
+ if is_valid_line:
+ valid_count += 1
+ else:
+ summary["discarded_lines"] += 1
+ if error_obj:
+ errors.append(error_obj)
+
+ if errors:
+ LogFileDiscardedLine.objects.bulk_create(errors)
+
+ summary["lines_parsed"] += line_count
+ summary["valid_lines"] += valid_count
+ log_file.summary = {
+ "parsing_completed": True,
+ "lines_parsed": line_count,
+ "valid_lines": valid_count,
+ }
+ log_file.last_processed_line = log_parser.stats.lines_parsed
+ log_file.parse_heartbeat_at = timezone.now()
+ log_file.save(
+ update_fields=[
+ "summary",
+ "last_processed_line",
+ "parse_heartbeat_at",
+ "updated",
+ ]
+ )
+
+ documents = index_docs.convert_raw_results_to_index_documents(results)
+ storage_path = daily_payloads.build_daily_storage_path(job.collection, job.access_date)
+ payload = {
+ "collection": job.collection.acron3,
+ "access_date": job.access_date.isoformat(),
+ "input_log_hashes": summary["input_log_hashes"],
+ "documents": documents,
+ "summary": summary,
+ }
+ payload_hash = daily_payloads.write_payload(storage_path, payload)
+
+ job.input_log_hashes = summary["input_log_hashes"]
+ job.storage_path = storage_path.as_posix()
+ job.payload_hash = payload_hash
+ job.summary = {
+ **summary,
+ "month_document_count": len(documents.get("month", {})),
+ "year_document_count": len(documents.get("year", {})),
+ }
+ job.save(
+ update_fields=[
+ "input_log_hashes",
+ "storage_path",
+ "payload_hash",
+ "summary",
+ "updated",
+ ]
+ )
+
+ return payload
+
+
+def setup_parsing_environment(log_file, robots_list, mmdb):
+ lp = log_handler.LogParser(mmdb_data=mmdb.data, robots_list=robots_list, output_mode="dict")
+ lp.logfile = log_file.path
+
+ translator_class = None
+ for cld in CollectionLogDirectory.objects.filter(config__collection=log_file.collection):
+ if cld.path in log_file.path:
+ if cld.translator_class:
+ translator_class = parser.translator_class_name_to_obj(cld.translator_class)
+ break
+
+ if not translator_class:
+ raise Exception(f"No URL translator class found for collection {log_file.collection}.")
+
+ utm = url_translator.URLTranslationManager(
+ documents_metadata=Document.metadata(collection=log_file.collection),
+ sources_metadata=Source.metadata(collection=log_file.collection),
+ translator=translator_class,
+ )
+ return lp, utm
+
+
+def process_line(results, line, utm, log_file, track_errors=False):
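+    # Translate the hit URL, extract and validate the item access data, then
+    # accumulate it into "results". Returns (is_valid, discarded_line_or_None).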
+ try:
+ translated_url = utm.translate(line.get("url"))
+ except Exception as exc:
+ logging.error("Error translating URL %s: %s", line.get("url"), exc)
+ return False, None
+
+ try:
+ item_access_data = access.extract_item_access_data(log_file.collection.acron3, translated_url)
+ except Exception as exc:
+ logging.error("Error extracting item access data from URL %s: %s", line.get("url"), exc)
+ return False, None
+
+ ignore_utm_validation = not track_errors
+ is_valid, check_result = access.is_valid_item_access_data(
+ item_access_data,
+ utm,
+ ignore_utm_validation,
+ )
+
+ if not is_valid:
+ if track_errors:
+ error_code = check_result.get("code")
+ if error_code in {
+ "invalid_scielo_issn",
+ "invalid_source_id",
+ "invalid_pid_v3",
+ "invalid_pid_v2",
+ "invalid_pid_generic",
+ }:
+ tracker_error_type = (
+ LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT
+ if "pid" in error_code
+ else LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE
+ )
+
+                return False, LogFileDiscardedLine.create(
+                    log_file=log_file,
+                    error_type=tracker_error_type,
+                    message=check_result.get("message"),
+                    data={"line": line, "item_access_data": item_access_data},
+                    save=False,
+                )
+
+ return False, None
+
+ try:
+ access.update_results_with_item_access_data(results, item_access_data, line)
+ except Exception as exc:
+ logging.error("Error updating metrics results for URL %s: %s", line.get("url"), exc)
+ return False, None
+
+ return True, None
+
+
+def touch_parse_heartbeat(log_file, last_processed_line=None):
+ heartbeat_at = timezone.now()
+ update_kwargs = {
+ "parse_heartbeat_at": heartbeat_at,
+ "updated": heartbeat_at,
+ }
+ if last_processed_line is not None:
+ update_kwargs["last_processed_line"] = last_processed_line or 0
+ log_file.last_processed_line = last_processed_line or 0
+ LogFile.objects.filter(pk=log_file.pk).update(**update_kwargs)
+ log_file.parse_heartbeat_at = heartbeat_at
+
+
+def is_stale_parsing_log(log_file, stale_after_minutes=60):
+ if log_file.status != choices.LOG_FILE_STATUS_PARSING:
+ return False
+
+ if not log_file.parse_heartbeat_at:
+ return True
+
+ cutoff = timezone.now() - timedelta(minutes=stale_after_minutes)
+ return log_file.parse_heartbeat_at < cutoff
+
+
+def requeue_stale_parsing_log(log_file):
+ now = timezone.now()
+ LogFile.objects.filter(pk=log_file.pk).update(
+ status=choices.LOG_FILE_STATUS_ERROR,
+ parse_heartbeat_at=None,
+ updated=now,
+ )
+ log_file.status = choices.LOG_FILE_STATUS_ERROR
+ log_file.parse_heartbeat_at = None
diff --git a/metrics/services/resources.py b/metrics/services/resources.py
new file mode 100644
index 0000000..dc31400
--- /dev/null
+++ b/metrics/services/resources.py
@@ -0,0 +1,54 @@
+import logging
+
+from django.conf import settings
+
+from log_manager.models import LogFile
+from resources.models import MMDB, RobotUserAgent
+
+from metrics import opensearch
+
+
+def extract_celery_queue_name(collection_acronym):
+ return f"parse_{settings.COLLECTION_ACRON3_SIZE_MAP.get(collection_acronym, 'small')}"
+
+
+def fetch_required_resources(robot_source=None):
+ robots_list = RobotUserAgent.get_patterns(source=robot_source)
+ if not robots_list:
+ logging.error(
+ "There are no robots available in the database for source %s.",
+ RobotUserAgent.normalize_source(robot_source),
+ )
+ return None, None
+
+ try:
+ mmdb = MMDB.objects.latest("created")
+ except MMDB.DoesNotExist:
+ logging.error("There are no MMDB files available in the database.")
+ return None, None
+
+ return robots_list, mmdb
+
+
+def build_search_client():
+ return opensearch.OpenSearchUsageClient(
+ settings.OPENSEARCH_URL,
+ settings.OPENSEARCH_BASIC_AUTH,
+ settings.OPENSEARCH_API_KEY,
+ settings.OPENSEARCH_VERIFY_CERTS,
+ )
+
+
+def get_log_files_for_collection_date(collection, access_date, status_filters=None):
+ queryset = (
+ LogFile.objects.filter(
+ collection=collection,
+ date=access_date,
+ )
+ .select_related("collection")
+ .order_by("path", "hash")
+ )
+ if status_filters:
+ queryset = queryset.filter(status__in=status_filters)
+
+ return list(queryset)
diff --git a/metrics/tasks.py b/metrics/tasks.py
deleted file mode 100644
index 026bfb5..0000000
--- a/metrics/tasks.py
+++ /dev/null
@@ -1,508 +0,0 @@
-import logging
-
-from django.conf import settings
-from django.contrib.auth import get_user_model
-from django.utils.translation import gettext as _
-
-from scielo_usage_counter import log_handler
-from scielo_usage_counter import url_translator
-
-from config import celery_app
-from core.utils.utils import _get_user
-from core.utils.date_utils import get_date_obj, get_date_range_str
-from article.models import Article
-from collection.models import Collection
-from journal.models import Journal
-from log_manager import choices
-from log_manager_config.models import CollectionURLTranslatorClass, CollectionLogFilesPerDay, CollectionLogDirectory
-from log_manager.models import LogFile, CollectionLogFileDateCount, LogFileDate
-from resources.models import MMDB, RobotUserAgent
-from tracker.models import LogFileDiscardedLine
-from tracker.choices import LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE, LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL
-
-from . import es
-from .utils import parser_utils, index_utils
-
-
-User = get_user_model()
-
-
-def extract_celery_queue_name(collection_acronym):
- return f"parse_{settings.COLLECTION_ACRON3_SIZE_MAP.get(collection_acronym, 'small')}"
-
-
-@celery_app.task(bind=True, name=_('Parse logs'), timelimit=-1)
-def task_parse_logs(self, collections=[], include_logs_with_error=True, batch_size=5000, replace=False, track_errors=False, from_date=None, until_date=None, days_to_go_back=None, user_id=None, username=None):
- """
- Parses log files associated with a given collection.
-
- Args:
- collections (list, optional): List of collection acronyms to parse logs for. Defaults to all collections.
- include_logs_with_error (bool, optional): Whether to include logs with errors. Defaults to True.
- batch_size (int, optional): Number of records to process in a single batch. Defaults to 5000.
- replace (bool, optional): Whether to replace existing records. Defaults to False.
- track_errors (bool, optional): Whether to track errors in log parsing. Defaults to False.
- from_date (str, optional): Start date for log parsing in 'YYYY-MM-DD' format. Defaults to None.
- until_date (str, optional): End date for log parsing in 'YYYY-MM-DD' format. Defaults to None.
- days_to_go_back (int, optional): Number of days to go back from the current date to parse logs. Defaults to None.
- user_id
- username
-
- Returns:
- None.
- """
- from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back)
-
- from_date_obj = get_date_obj(from_date)
- until_date_obj = get_date_obj(until_date)
-
- # Set status filters based on the include_logs_with_error and replace flags
- status_filters = [choices.LOG_FILE_STATUS_QUEUED]
- if include_logs_with_error:
- status_filters.append(choices.LOG_FILE_STATUS_ERROR)
- if replace:
- status_filters.append(choices.LOG_FILE_STATUS_PROCESSED)
-
- for collection in collections or Collection.acron3_list():
- for lf in LogFile.objects.filter(status__in=status_filters, collection__acron3=collection):
- probably_date = parser_utils.extract_date_from_validation_dict(lf.validation)
- if not probably_date:
- logging.debug(f'Log file {lf.path} does not have a valid probably date.')
- continue
-
- if probably_date < from_date_obj or probably_date > until_date_obj:
- continue
-
- queue_name = extract_celery_queue_name(collection)
-
- logging.info(f'PARSING file {lf.path}')
- task_parse_log.apply_async(
- args=(lf.hash, batch_size, replace, track_errors, user_id, username),
- queue=queue_name,
- )
-
-
-@celery_app.task(bind=True, name=_('Parse one log'), timelimit=-1)
-def task_parse_log(self, log_file_hash, batch_size=5000, replace=False, track_errors=False, user_id=None, username=None):
- """
- Parses a log file, extracts relevant information, and creates processed log records in the database.
-
- Args:
- log_file_hash (str): Hash representing the log file to be parsed.
- batch_size (int, optional): Number of records to process in a single batch. Defaults to 5000.
- replace (bool, optional): Whether to replace existing records. Defaults to False.
- track_errors (bool, optional): Whether to track errors in log parsing. Defaults to False.
- user_id
- username
-
- Returns:
- None.
- """
- user = _get_user(self.request, username=username, user_id=user_id)
- robots_list, mmdb = _fetch_required_resources()
- if not robots_list or not mmdb:
- return
-
- log_file = _initialize_log_file(log_file_hash)
- if not log_file:
- return
-
- clfdc = create_or_update_collection_log_file_date_count(
- user=user,
- collection=log_file.collection,
- date=get_date_obj(log_file.validation.get('probably_date'))
- )
-
- if not replace and clfdc.is_usage_metric_computed:
- logging.info(f'Usage metric already computed for {log_file.validation.get("probably_date")}')
- return
-
- if replace:
- clfdc.exported_files_count = 0
- clfdc.is_usage_metric_computed = False
- clfdc.save()
-
- log_parser, url_translator_manager = _setup_parsing_environment(log_file, robots_list, mmdb)
- success = _process_lines(lp=log_parser, utm=url_translator_manager, log_file=log_file, batch_size=batch_size, replace=replace, track_errors=track_errors)
-
- if not success:
- logging.error(f'Failed to parse log file {log_file.path}.')
- log_file.status = choices.LOG_FILE_STATUS_ERROR
- log_file.save()
- return
-
- log_file.status = choices.LOG_FILE_STATUS_PROCESSED
- log_file.save()
-
- _update_exported_files_count(clfdc)
-
- logging.info(f'Log file {log_file.path} has been successfully parsed.')
-
-
-def create_or_update_collection_log_file_date_count(user, collection, date):
- n_expected_files = CollectionLogFilesPerDay.get_number_of_expected_files_by_day(collection=collection.acron3, date=date)
- n_found_logs = LogFileDate.get_number_of_found_files_for_date(collection=collection.acron3, date=date)
-
- return CollectionLogFileDateCount.create_or_update(
- user=user,
- collection=collection,
- date=date,
- expected_log_files=n_expected_files,
- found_log_files=n_found_logs,
- )
-
-
-def _initialize_log_file(log_file_hash):
- """
- Initializes the log file for parsing by setting its status to 'parsing'.
-
- Args:
- log_file_hash (str): The hash of the log file to be initialized.
-
- Returns:
- LogFile: The initialized log file object, or None if it does not exist.
- """
- try:
- log_file = LogFile.get(hash=log_file_hash)
- log_file.status = choices.LOG_FILE_STATUS_PARSING
- log_file.save()
- return log_file
- except LogFile.DoesNotExist:
- logging.error(f'Log file with hash {log_file_hash} does not exist.')
- return None
-
-
-def _fetch_required_resources():
- """
- Fetches the necessary resources for parsing logs, including robot user agents and MMDB files.
-
- Returns:
- tuple: A tuple containing the list of robot user agents and the latest MMDB object.
- """
- robots_list = RobotUserAgent.get_all_patterns()
- if not robots_list:
- logging.error('There are no robots available in the database.')
- return None, None
-
- mmdb = MMDB.objects.latest('created')
- if not mmdb:
- logging.error('There are no MMDB files available in the database.')
- return None, None
-
- return robots_list, mmdb
-
-
-def _setup_parsing_environment(log_file, robots_list, mmdb):
- """
- Sets up the environment for parsing the log file, including initializing the log parser and URL translator manager.
-
- Args:
- log_file (LogFile): The log file to be parsed.
- robots_list (list): List of robot user agents.
- mmdb (MMDB): The MMDB object containing geolocation data.
-
- Returns:
- tuple: A tuple containing the LogParser instance and URLTranslationManager instance.
- """
- lp = log_handler.LogParser(mmdb_data=mmdb.data, robots_list=robots_list, output_mode='dict')
- lp.logfile = log_file.path
-
- translator_class = None
- for cld in CollectionLogDirectory.objects.filter(collection=log_file.collection):
- if cld.path in log_file.path:
- try:
- translator_class_name = CollectionURLTranslatorClass.objects.get(collection=log_file.collection, directory=cld).translator_class
- translator_class = parser_utils.translator_class_name_to_obj(translator_class_name)
- break
- except CollectionURLTranslatorClass.DoesNotExist:
- continue
-
- if not translator_class:
- raise Exception(f'No URL translator class found for collection {log_file.collection}.')
-
- logging.info(f'Creating URL translation manager for {log_file.collection}')
- utm = url_translator.URLTranslationManager(
- articles_metadata=Article.metadata(collection=log_file.collection),
- journals_metadata=Journal.metadata(collection=log_file.collection),
- translator=translator_class,
- )
- return lp, utm
-
-
-def _process_lines(lp, utm, log_file, batch_size=5000, replace=False, track_errors=False):
- """
- Processes each line of the log file, translating URLs and registering item accesses.
-
- Args:
- lp (LogParser): The log parser instance.
- utm (URLTranslationManager): The URL translation manager instance.
- log_file (LogFile): The log file being processed.
- batch_size (int, optional): Number of records to process in a single batch. Defaults to 5000.
- replace (bool, optional): Whether to replace existing records. Defaults to False.
- track_errors (bool, optional): Whether to track errors in log parsing. Defaults to False.
-
- Returns:
- None.
- """
- logging.info(f'Processing {lp.logfile}')
- results = {}
- errors = []
-
- jump = log_file.last_processed_line if not replace else 0
-
- es_manager = es.ElasticSearchUsageWrapper(
- settings.ES_URL,
- settings.ES_BASIC_AUTH,
- settings.ES_API_KEY,
- settings.ES_VERIFY_CERTS
- )
-
- if not es_manager.ping():
- logging.error('Elasticsearch client is not available.')
- return False
-
- index_name = index_utils.generate_index_name(
- index_prefix=settings.ES_INDEX_NAME,
- collection=log_file.collection.acron3,
- date=log_file.validation.get('probably_date')
- )
-
- es_manager.create_index_if_not_exists(index_name=index_name)
-
- if replace:
- logging.info(f'Removing existing documents for collection {log_file.collection.acron3} and date {log_file.validation.get("probably_date")}')
- delete_success = es_manager.delete_documents_by_key(
- index_name=index_name,
- data={'collection': log_file.collection.acron3, 'date': log_file.validation.get('probably_date')},
- )
- if not delete_success:
- logging.error(f'Failed to delete existing documents for collection {log_file.collection.acron3} and date {log_file.validation.get("probably_date")}')
- return False
-
- for line in lp.parse():
- if lp.stats.lines_parsed < jump:
- continue
-
- if lp.stats.lines_parsed % batch_size == 0:
- logging.info(f'Processing line {lp.stats.lines_parsed} of {lp.logfile}')
-
- is_valid_line, error_obj = _process_line(results, line, utm, log_file, track_errors)
- if not is_valid_line:
- if error_obj:
- errors.append(error_obj)
-
- if len(errors) >= batch_size:
- LogFileDiscardedLine.objects.bulk_create(errors)
- errors = []
- continue
-
- if len(results) >= batch_size:
- logging.info(f'Indexing data for log file {log_file.path}')
- es_manager.export_to_index(
- index_name=index_name,
- data=results,
- batch_size=batch_size
- )
- results = {}
-
- _update_log_file_summary(log_file, lp.stats.get_stats())
-
- logging.info(f'Indexing data for log file {log_file.path}')
- es_manager.export_to_index(
- index_name=index_name,
- data=results,
- batch_size=batch_size
- )
- results = {}
-
- LogFileDiscardedLine.objects.bulk_create(errors) if errors else None
- errors = []
-
- _update_log_file_summary(log_file, lp.stats.get_stats())
-
- return True
-
-
-def _update_log_file_summary(log_file, stats):
- if not stats:
- logging.warning(f'No stats available for log file {log_file.path}. Skipping summary update.')
- return
-
- summary_k, summary_v = stats
- log_file.summary = dict(zip(summary_k, summary_v))
- log_file.last_processed_line = log_file.summary.get('lines_parsed', 0)
- log_file.save()
-
-
-def _update_exported_files_count(collection_log_file_date: CollectionLogFileDateCount):
- collection_log_file_date.exported_files_count += 1
- collection_log_file_date.set_is_usage_metric_computed()
- collection_log_file_date.save()
-
-
-def _process_line(results, line, utm, log_file, track_errors=False):
- """
- Process a single log line to extract and validate item access data.
- This function translates a URL from the log line, extracts item access data,
- validates the data, and updates the results if the data is valid.
-
- Args:
- results: Dictionary or data structure to store processed results
- line (dict): Log line containing URL and other access information
- utm: URL translation manager for converting URLs
- log_file: Log file object containing collection information (must have collection.acron3)
- track_errors (bool): Whether to track errors in log parsing.
-
- Returns:
- tuple: A tuple containing a boolean indicating success or failure, and an optional LogFileDiscardedLine object.
-
- Raises:
- Logs errors for URL translation failures and item access data extraction failures.
- Logs debug messages for invalid item access data.
- """
- try:
- translated_url = utm.translate(line.get('url'))
- except Exception as e:
- logging.error(f'Error translating URL {line.get("url")}: {e}')
- return False, None
-
- try:
- item_access_data = index_utils.extract_item_access_data(log_file.collection.acron3, translated_url)
- except Exception as e:
- logging.error(f'Error extracting item access data from URL {line.get("url")}: {e}')
- return False, None
-
- ignore_utm_validation = not track_errors
- is_valid, check_result = index_utils.is_valid_item_access_data(item_access_data, utm, ignore_utm_validation)
-
- if not is_valid:
- if track_errors:
- error_code = check_result.get('code')
-
- if error_code in {
- 'invalid_scielo_issn',
- 'invalid_pid_v3',
- 'invalid_pid_v2',
- 'invalid_pid_generic'
- }:
- if 'pid' in error_code:
- tracker_error_type = LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE
- else:
- tracker_error_type = LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL
-
- lfdl = LogFileDiscardedLine.create(
- log_file=log_file,
- error_type=tracker_error_type,
- message=check_result.get('message'),
- data={'line': line, 'item_access_data': item_access_data},
- save=False,
- )
- logging.debug(f'Invalid item access data: {check_result.get("message")}. Line: {line}. Item Access Data: {item_access_data}')
- return False, lfdl
-
- return False, None
-
- index_utils.update_results_with_item_access_data(
- results,
- item_access_data,
- line
- )
-
- return True, None
-
-
-@celery_app.task(bind=True, name=_('Create index'), timelimit=-1)
-def task_create_index(self, index_name, mappings=None, user_id=None, username=None):
- """
- Creates an Elasticsearch index with the specified settings and mappings.
-
- Args:
- index_name (str): The name of the index to be created.
- mappings (dict, optional): The mappings for the index. Defaults to None.
- user_id (int, optional): The ID of the user initiating the task. Defaults to None.
- username (str, optional): The username of the user initiating the task. Defaults to None.
-
- Returns:
- None.
- """
- user = _get_user(self.request, username=username, user_id=user_id)
- es_manager = es.ElasticSearchUsageWrapper(
- settings.ES_URL,
- settings.ES_BASIC_AUTH,
- settings.ES_API_KEY,
- settings.ES_VERIFY_CERTS
- )
-
- try:
- if es_manager.client.indices.exists(index=index_name):
- logging.info(f"Index {index_name} already exists.")
- return
-
- es_manager.create_index(index_name=index_name, mappings=mappings)
- logging.info(f"Index {index_name} created successfully.")
- except Exception as e:
- logging.error(f"Failed to create index {index_name}: {e}")
-
-
-@celery_app.task(bind=True, name=_('Delete index'), timelimit=-1)
-def task_delete_index(self, index_name, user_id=None, username=None):
- """
- Deletes an Elasticsearch index.
-
- Args:
- index_name (str): The name of the index to be deleted.
- user_id (int, optional): The ID of the user initiating the task. Defaults to None.
- username (str, optional): The username of the user initiating the task. Defaults to None.
-
- Returns:
- None.
- """
- user = _get_user(self.request, username=username, user_id=user_id)
- es_manager = es.ElasticSearchUsageWrapper(
- settings.ES_URL,
- settings.ES_BASIC_AUTH,
- settings.ES_API_KEY,
- settings.ES_VERIFY_CERTS
- )
-
- try:
- if not es_manager.client.indices.exists(index=index_name):
- logging.info(f"Index {index_name} does not exist.")
- return
-
- es_manager.client.indices.delete(index=index_name)
- logging.info(f"Index {index_name} deleted successfully.")
- except Exception as e:
- logging.error(f"Failed to delete index {index_name}: {e}")
-
-
-@celery_app.task(bind=True, name=_('Delete documents by key'), timelimit=-1)
-def task_delete_documents_by_key(self, index_name, data, user_id=None, username=None):
- """
- Deletes documents from Elasticsearch based on the provided keys and values.
-
- Args:
- index_name (str): The name of the Elasticsearch index. Defaults to settings.ES_INDEX_NAME.
- data (dict): A dictionary where keys are field names and values are the corresponding values to match for deletion.
- user_id (int, optional): The ID of the user initiating the task. Defaults to None.
- username (str, optional): The username of the user initiating the task. Defaults to None.
-
- Returns:
- None.
- """
- user = _get_user(self.request, username=username, user_id=user_id)
- es_manager = es.ElasticSearchUsageWrapper(
- settings.ES_URL,
- settings.ES_BASIC_AUTH,
- settings.ES_API_KEY,
- settings.ES_VERIFY_CERTS
- )
-
- try:
- es_manager.delete_documents_by_key(
- index_name=index_name,
- data=data,
- )
- logging.info(f"Successfully deleted documents with data: {data} from index {index_name}.")
- except Exception as e:
- logging.error(f"Failed to delete documents with data {data} from index {index_name}: {e}")
diff --git a/metrics/tasks/__init__.py b/metrics/tasks/__init__.py
new file mode 100644
index 0000000..f0c2d6a
--- /dev/null
+++ b/metrics/tasks/__init__.py
@@ -0,0 +1,19 @@
+from .parse import (
+ task_parse_logs,
+ task_wait_parse_logs_wave,
+)
+from .process import (
+ task_process_daily_metric_job,
+)
+from .resume import (
+ task_resume_log_exports,
+ task_resume_stale_parsing_logs,
+)
+from .index import (
+ task_create_index,
+ task_delete_index,
+ task_delete_documents_by_key,
+)
+from .cleanup import (
+ task_cleanup_daily_payloads,
+)
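+
+# Note: every task is re-exported here so callers can keep importing from
+# metrics.tasks (e.g. `from metrics.tasks import task_parse_logs`) after the
+# task code was split into per-concern modules.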
diff --git a/metrics/tasks/cleanup.py b/metrics/tasks/cleanup.py
new file mode 100644
index 0000000..9b3c8e0
--- /dev/null
+++ b/metrics/tasks/cleanup.py
@@ -0,0 +1,31 @@
+import logging
+
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils.request_utils import _get_user
+from metrics.services import daily_payloads
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Cleanup Daily Payloads"), timelimit=-1)
+def task_cleanup_daily_payloads(
+ self,
+ collections=None,
+ older_than_days=7,
+ user_id=None,
+ username=None,
+):
+ _get_user(self.request, username=username, user_id=user_id)
+
+ deleted_count = daily_payloads.cleanup_exported_payloads(
+ collections=collections or [],
+ older_than_days=older_than_days,
+ )
+
+ logging.info(
+ "Cleanup task completed: %s payload file(s) deleted (collections=%s, older_than_days=%s).",
+ deleted_count,
+ collections or "all",
+ older_than_days,
+ )
+ return {"deleted_payloads": deleted_count}
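+
+# Scheduling sketch (an assumption, not part of this module): the cleanup task could be
+# wired into Celery beat from the project settings using the registered task name from
+# the decorator above. The schedule and kwargs below are illustrative only.
+#
+#   from celery.schedules import crontab
+#
+#   CELERY_BEAT_SCHEDULE = {
+#       "metrics-cleanup-daily-payloads": {
+#           "task": "[Metrics] Cleanup Daily Payloads",
+#           "schedule": crontab(hour=3, minute=0),
+#           "kwargs": {"older_than_days": 7},
+#       },
+#   }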
diff --git a/metrics/tasks/index.py b/metrics/tasks/index.py
new file mode 100644
index 0000000..2635377
--- /dev/null
+++ b/metrics/tasks/index.py
@@ -0,0 +1,61 @@
+import logging
+
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils.request_utils import _get_user
+
+from metrics.services.resources import build_search_client
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Create Index"), timelimit=-1)
+def task_create_index(self, index_name, mappings=None, user_id=None, username=None):
+ _get_user(self.request, username=username, user_id=user_id)
+ search_client = build_search_client()
+
+ try:
+ if search_client.client.indices.exists(index=index_name):
+ logging.info("Index %s already exists.", index_name)
+ return
+
+ search_client.create_index(index_name=index_name, mappings=mappings or {})
+ logging.info("Index %s created successfully.", index_name)
+ except Exception as exc:
+ logging.error("Failed to create index %s: %s", index_name, exc)
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Delete Index"), timelimit=-1)
+def task_delete_index(self, index_name, user_id=None, username=None):
+ _get_user(self.request, username=username, user_id=user_id)
+ search_client = build_search_client()
+
+ try:
+ if not search_client.client.indices.exists(index=index_name):
+ logging.info("Index %s does not exist.", index_name)
+ return
+
+ search_client.delete_index(index_name=index_name)
+ logging.info("Index %s deleted successfully.", index_name)
+ except Exception as exc:
+ logging.error("Failed to delete index %s: %s", index_name, exc)
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Delete Documents by Key"), timelimit=-1)
+def task_delete_documents_by_key(self, index_name, data, user_id=None, username=None):
+ _get_user(self.request, username=username, user_id=user_id)
+ search_client = build_search_client()
+
+ try:
+ search_client.delete_documents_by_key(index_name=index_name, data=data)
+ logging.info(
+ "Successfully deleted documents with data: %s from index %s.",
+ data,
+ index_name,
+ )
+ except Exception as exc:
+ logging.error(
+ "Failed to delete documents with data %s from index %s: %s",
+ data,
+ index_name,
+ exc,
+ )
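+
+# Usage sketch (illustrative values only; the index name is hypothetical and should
+# match whatever metrics.opensearch.names generates for the target month):
+#
+#   task_delete_documents_by_key.apply_async(kwargs={
+#       "index_name": "usage-scl-202401",
+#       "data": {"collection": "scl", "access_date": "2024-01-15"},
+#   })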
diff --git a/metrics/tasks/parse.py b/metrics/tasks/parse.py
new file mode 100644
index 0000000..7748922
--- /dev/null
+++ b/metrics/tasks/parse.py
@@ -0,0 +1,286 @@
+import logging
+
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils.date_utils import get_date_obj, get_date_range_str
+from core.utils.request_utils import _get_user
+from collection.models import Collection
+from log_manager import choices
+from log_manager.models import LogFile
+from metrics.models import DailyMetricJob
+
+from metrics.services.resources import extract_celery_queue_name, get_log_files_for_collection_date
+from metrics.services.jobs import create_or_update_daily_metric_job
+from metrics.tasks.process import task_process_daily_metric_job
+
+AUTO_REEXECUTE_POLL_INTERVAL_SECONDS = 30
+
+
+@celery_app.task(bind=True, name=_("[Log Pipeline] 3. Parse Logs (Manual)"), timelimit=-1)
+def task_parse_logs(
+ self,
+ collections=None,
+ include_logs_with_error=True,
+ batch_size=5000,
+ max_log_files=None,
+ auto_reexecute=False,
+ replace=False,
+ track_errors=False,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ queue_name=None,
+ user_id=None,
+ username=None,
+ skip_log_hashes=None,
+ robots_source=None,
+):
+ if replace:
+ raise ValueError(
+ "replace=True is not supported. Recompute requires deleting/recreating "
+ "the affected day or period first."
+ )
+
+ from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back)
+ from_date_obj = get_date_obj(from_date)
+ until_date_obj = get_date_obj(until_date)
+ enqueued_jobs = 0
+ reached_max_log_files = False
+ enqueued_wave_job_ids = []
+ claimed_status_filters = list(_build_parse_status_filters(include_logs_with_error))
+ skip_log_hashes = set(skip_log_hashes or [])
+
+ for collection in collections or Collection.acron3_list():
+ collection_obj = Collection.objects.filter(acron3=collection).first()
+ if not collection_obj:
+ continue
+
+ access_dates = _find_access_dates(
+ collection=collection_obj,
+ from_date=from_date,
+ until_date=until_date,
+ from_date_obj=from_date_obj,
+ until_date_obj=until_date_obj,
+ status_filters=claimed_status_filters,
+ skip_log_hashes=skip_log_hashes,
+ )
+
+ for access_date in access_dates:
+ log_files = get_log_files_for_collection_date(
+ collection=collection_obj,
+ access_date=access_date,
+ status_filters=claimed_status_filters,
+ )
+ log_files = [log_file for log_file in log_files if log_file.hash not in skip_log_hashes]
+ if not log_files:
+ continue
+
+ job = create_or_update_daily_metric_job(
+ collection=collection_obj,
+ access_date=access_date,
+ log_files=log_files,
+ )
+ if job.status == DailyMetricJob.STATUS_EXPORTED:
+ continue
+
+ task_process_daily_metric_job.apply_async(
+ args=(job.pk, track_errors, user_id, username, robots_source),
+ queue=queue_name or extract_celery_queue_name(collection),
+ )
+ enqueued_wave_job_ids.append(job.pk)
+ enqueued_jobs += 1
+ if max_log_files and enqueued_jobs >= max_log_files:
+ reached_max_log_files = True
+ break
+
+ if reached_max_log_files:
+ break
+
+ auto_reexecution_enqueued = _schedule_parse_logs_reexecution(
+ should_reexecute=auto_reexecute and reached_max_log_files and bool(enqueued_wave_job_ids),
+ wave_job_ids=enqueued_wave_job_ids,
+ collections=collections,
+ include_logs_with_error=include_logs_with_error,
+ batch_size=batch_size,
+ max_log_files=max_log_files,
+ auto_reexecute=auto_reexecute,
+ replace=replace,
+ track_errors=track_errors,
+ from_date=from_date,
+ until_date=until_date,
+ days_to_go_back=days_to_go_back,
+ queue_name=queue_name,
+ user_id=user_id,
+ username=username,
+ skip_log_hashes=sorted(skip_log_hashes),
+ robots_source=robots_source,
+ )
+
+ return {
+ "enqueued_logs": enqueued_jobs,
+ "enqueued_jobs": enqueued_jobs,
+ "reached_max_log_files": reached_max_log_files,
+ "auto_reexecution_enqueued": auto_reexecution_enqueued,
+ }
+
+
+def _build_parse_status_filters(include_logs_with_error):
+ status_filters = [choices.LOG_FILE_STATUS_QUEUED]
+ if include_logs_with_error:
+ status_filters.append(choices.LOG_FILE_STATUS_ERROR)
+ return tuple(status_filters)
+
+
+def _find_access_dates(
+ collection,
+ from_date,
+ until_date,
+ from_date_obj,
+ until_date_obj,
+ status_filters,
+ skip_log_hashes,
+):
+ date_queryset = (
+ LogFile.objects.filter(
+ status__in=status_filters,
+ collection=collection,
+ date__gte=from_date_obj,
+ date__lte=until_date_obj,
+ )
+ .exclude(hash__in=skip_log_hashes)
+ .values_list("date", flat=True)
+ .distinct()
+ .order_by("date")
+ )
+
+ access_dates = set()
+ for value in list(date_queryset):
+ access_date = value if hasattr(value, "isoformat") else get_date_obj(value)
+ if access_date and from_date_obj <= access_date <= until_date_obj:
+ access_dates.add(access_date)
+ return sorted(access_dates)
+
+
+def _schedule_parse_logs_reexecution(
+ should_reexecute,
+ wave_job_ids,
+ collections,
+ include_logs_with_error,
+ batch_size,
+ max_log_files,
+ auto_reexecute,
+ replace,
+ track_errors,
+ from_date,
+ until_date,
+ days_to_go_back,
+ queue_name,
+ user_id,
+ username,
+ skip_log_hashes,
+ robots_source=None,
+):
+ if not should_reexecute:
+ return False
+
+ kwargs = {
+ "wave_job_ids": wave_job_ids,
+ "collections": collections,
+ "include_logs_with_error": include_logs_with_error,
+ "batch_size": batch_size,
+ "max_log_files": max_log_files,
+ "auto_reexecute": auto_reexecute,
+ "replace": replace,
+ "track_errors": track_errors,
+ "from_date": from_date,
+ "until_date": until_date,
+ "days_to_go_back": days_to_go_back,
+ "queue_name": queue_name,
+ "user_id": user_id,
+ "username": username,
+ "skip_log_hashes": skip_log_hashes,
+ "poll_interval_seconds": AUTO_REEXECUTE_POLL_INTERVAL_SECONDS,
+ }
+ if robots_source is not None:
+ kwargs["robots_source"] = robots_source
+
+ task_wait_parse_logs_wave.apply_async(kwargs=kwargs)
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Wait Parse Logs Wave"), timelimit=-1)
+def task_wait_parse_logs_wave(
+ self,
+ wave_job_ids=None,
+ collections=None,
+ include_logs_with_error=True,
+ batch_size=5000,
+ max_log_files=None,
+ auto_reexecute=False,
+ replace=False,
+ track_errors=False,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ queue_name=None,
+ user_id=None,
+ username=None,
+ skip_log_hashes=None,
+ poll_interval_seconds=AUTO_REEXECUTE_POLL_INTERVAL_SECONDS,
+ robots_source=None,
+ wave_log_hashes=None,
+):
+ wave_job_ids = wave_job_ids or wave_log_hashes or []
+ if DailyMetricJob.objects.filter(
+ pk__in=wave_job_ids,
+ status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_EXPORTING],
+ ).exists():
+ kwargs = {
+ "wave_job_ids": wave_job_ids,
+ "collections": collections,
+ "include_logs_with_error": include_logs_with_error,
+ "batch_size": batch_size,
+ "max_log_files": max_log_files,
+ "auto_reexecute": auto_reexecute,
+ "replace": replace,
+ "track_errors": track_errors,
+ "from_date": from_date,
+ "until_date": until_date,
+ "days_to_go_back": days_to_go_back,
+ "queue_name": queue_name,
+ "user_id": user_id,
+ "username": username,
+ "skip_log_hashes": skip_log_hashes,
+ "poll_interval_seconds": poll_interval_seconds,
+ }
+ if robots_source is not None:
+ kwargs["robots_source"] = robots_source
+
+ task_wait_parse_logs_wave.apply_async(
+ kwargs=kwargs,
+ countdown=poll_interval_seconds,
+ )
+ return {"wave_completed": False, "reexecution_enqueued": False}
+
+ kwargs = {
+ "collections": collections,
+ "include_logs_with_error": include_logs_with_error,
+ "batch_size": batch_size,
+ "max_log_files": max_log_files,
+ "auto_reexecute": auto_reexecute,
+ "replace": replace,
+ "track_errors": track_errors,
+ "from_date": from_date,
+ "until_date": until_date,
+ "days_to_go_back": days_to_go_back,
+ "queue_name": queue_name,
+ "user_id": user_id,
+ "username": username,
+ "skip_log_hashes": skip_log_hashes,
+ }
+ if robots_source is not None:
+ kwargs["robots_source"] = robots_source
+
+ task_parse_logs.apply_async(kwargs=kwargs)
+ return {"wave_completed": True, "reexecution_enqueued": True}
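+
+# Flow summary: task_parse_logs groups queued/error log files into one DailyMetricJob
+# per (collection, access_date) and enqueues at most max_log_files jobs per wave. When
+# auto_reexecute=True and the wave fills up, it chains task_wait_parse_logs_wave, which
+# polls the wave's job ids every AUTO_REEXECUTE_POLL_INTERVAL_SECONDS and, once no job
+# is pending or exporting, enqueues the next task_parse_logs wave with the same
+# arguments.
+#
+# Manual invocation sketch (values are illustrative):
+#
+#   task_parse_logs.apply_async(kwargs={
+#       "collections": ["scl"],
+#       "days_to_go_back": 30,
+#       "max_log_files": 50,
+#       "auto_reexecute": True,
+#   })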
diff --git a/metrics/tasks/process.py b/metrics/tasks/process.py
new file mode 100644
index 0000000..ecdc7a5
--- /dev/null
+++ b/metrics/tasks/process.py
@@ -0,0 +1,63 @@
+import logging
+
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils.request_utils import _get_user
+from metrics.models import DailyMetricJob
+
+from metrics.services.jobs import acquire_daily_metric_job, mark_daily_metric_job_exported, mark_daily_metric_job_failed
+from metrics.services.export import export_daily_metric_payload, load_daily_metric_payload
+from metrics.services.resources import build_search_client, fetch_required_resources
+from metrics.services.parser import process_daily_metric_job
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Process Daily Job"), timelimit=-1)
+def task_process_daily_metric_job(
+ self,
+ job_id,
+ track_errors=False,
+ user_id=None,
+ username=None,
+ robots_source=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+
+ try:
+ job = acquire_daily_metric_job(job_id)
+ except DailyMetricJob.DoesNotExist:
+ logging.error("Daily metric job %s does not exist.", job_id)
+ return
+
+ if not job:
+ return
+
+ try:
+ payload = load_daily_metric_payload(job)
+ if payload is None or not job.payload_hash:
+ robots_list, mmdb = fetch_required_resources(robot_source=robots_source)
+ if not robots_list or not mmdb:
+ raise RuntimeError("Required parsing resources are not available.")
+ payload = process_daily_metric_job(
+ job=job,
+ robots_list=robots_list,
+ mmdb=mmdb,
+ track_errors=track_errors,
+ )
+ job.refresh_from_db()
+
+ search_client = build_search_client()
+ if not search_client.ping():
+ raise RuntimeError("OpenSearch client is not available.")
+
+ export_daily_metric_payload(
+ search_client=search_client,
+ job=job,
+ payload=payload,
+ )
+ except Exception as exc:
+ logging.error("Failed to process daily metric job %s: %s", job_id, exc)
+ mark_daily_metric_job_failed(job, exc)
+ return
+
+ mark_daily_metric_job_exported(job, user=user)
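+
+# Job lifecycle: the task acquires the DailyMetricJob, reuses a previously stored
+# payload when one exists (load_daily_metric_payload plus a non-empty payload_hash),
+# otherwise recomputes it from the raw logs with process_daily_metric_job, and then
+# exports the payload to OpenSearch via export_daily_metric_payload. Any exception
+# marks the job as failed; on success the job is marked exported with the acting user
+# recorded.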
diff --git a/metrics/tasks/resume.py b/metrics/tasks/resume.py
new file mode 100644
index 0000000..c0fe705
--- /dev/null
+++ b/metrics/tasks/resume.py
@@ -0,0 +1,166 @@
+import logging
+
+from django.utils import timezone
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils.date_utils import get_date_obj, get_date_range_str
+from core.utils.request_utils import _get_user
+from log_manager import choices
+from log_manager.models import LogFile
+from metrics.models import DailyMetricJob
+
+from metrics.services.jobs import create_or_update_daily_metric_job, release_stale_daily_metric_jobs
+from metrics.services.resources import extract_celery_queue_name, get_log_files_for_collection_date
+from metrics.services.parser import is_stale_parsing_log, requeue_stale_parsing_log
+from metrics.counter import parser
+
+from .parse import task_parse_logs
+from .process import task_process_daily_metric_job
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Resume Log Exports"), timelimit=-1)
+def task_resume_log_exports(
+ self,
+ collections=None,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ stale_after_minutes=60,
+ queue_name=None,
+ user_id=None,
+ username=None,
+ robots_source=None,
+):
+ _get_user(self.request, username=username, user_id=user_id)
+
+ from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back)
+ from_date_obj = get_date_obj(from_date)
+ until_date_obj = get_date_obj(until_date)
+
+ released_stale_jobs = release_stale_daily_metric_jobs(
+ collections=collections,
+ from_date=from_date_obj,
+ until_date=until_date_obj,
+ stale_after_minutes=stale_after_minutes,
+ )
+ queryset = DailyMetricJob.objects.filter(
+ status__in=[DailyMetricJob.STATUS_PENDING, DailyMetricJob.STATUS_ERROR],
+ access_date__gte=from_date_obj,
+ access_date__lte=until_date_obj,
+ ).select_related("collection").order_by("access_date", "collection__acron3")
+ if collections:
+ queryset = queryset.filter(collection__acron3__in=collections)
+
+ resumed_jobs = 0
+ for job in queryset:
+ log_files = get_log_files_for_collection_date(
+ collection=job.collection,
+ access_date=job.access_date,
+ status_filters=[
+ choices.LOG_FILE_STATUS_QUEUED,
+ choices.LOG_FILE_STATUS_ERROR,
+ ],
+ )
+ if log_files:
+ job = create_or_update_daily_metric_job(
+ collection=job.collection,
+ access_date=job.access_date,
+ log_files=log_files,
+ )
+ elif not (job.storage_path and job.payload_hash):
+ logging.warning(
+ "Skipping daily metric job %s: no queued/error logs or stored payload.",
+ job.pk,
+ )
+ continue
+
+ if job.status == DailyMetricJob.STATUS_EXPORTED:
+ continue
+
+ task_process_daily_metric_job.apply_async(
+ args=(job.pk, False, user_id, username, robots_source),
+ queue=queue_name or extract_celery_queue_name(job.collection.acron3),
+ )
+ resumed_jobs += 1
+
+ logging.info(
+ "Resumed daily metric jobs for %s day(s); released %s stale job(s) at %s.",
+ resumed_jobs,
+ released_stale_jobs,
+ timezone.now(),
+ )
+ return {
+ "resumed_logs": resumed_jobs,
+ "resumed_jobs": resumed_jobs,
+ "released_stale_batches": released_stale_jobs,
+ "released_stale_jobs": released_stale_jobs,
+ }
+
+
+@celery_app.task(bind=True, name=_("[Metrics] Resume Stale Parsing Logs"), timelimit=-1)
+def task_resume_stale_parsing_logs(
+ self,
+ collections=None,
+ batch_size=5000,
+ track_errors=False,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ stale_after_minutes=60,
+ max_log_files=None,
+ queue_name=None,
+ user_id=None,
+ username=None,
+ robots_source=None,
+):
+ from_date, until_date = get_date_range_str(from_date, until_date, days_to_go_back)
+ from_date_obj = get_date_obj(from_date)
+ until_date_obj = get_date_obj(until_date)
+
+ queryset = (
+ LogFile.objects.filter(status=choices.LOG_FILE_STATUS_PARSING)
+ .select_related("collection")
+ .order_by("validation__probably_date", "path", "hash")
+ )
+ if collections:
+ queryset = queryset.filter(collection__acron3__in=collections)
+
+ resumed_logs = 0
+ for log_file in queryset:
+ probably_date = parser.extract_date_from_validation_dict(log_file.validation)
+ if not probably_date or probably_date < from_date_obj or probably_date > until_date_obj:
+ continue
+ if not is_stale_parsing_log(log_file, stale_after_minutes=stale_after_minutes):
+ continue
+
+ requeue_stale_parsing_log(log_file)
+ resumed_logs += 1
+ if max_log_files and resumed_logs >= max_log_files:
+ break
+
+ apply_kwargs = {
+ "kwargs": {
+ "collections": collections,
+ "include_logs_with_error": True,
+ "batch_size": batch_size,
+ "max_log_files": max_log_files,
+ "auto_reexecute": False,
+ "replace": False,
+ "track_errors": track_errors,
+ "from_date": from_date,
+ "until_date": until_date,
+ "days_to_go_back": None,
+ "queue_name": queue_name,
+ "user_id": user_id,
+ "username": username,
+ "robots_source": robots_source,
+ }
+ }
+ if queue_name:
+ apply_kwargs["queue"] = queue_name
+ task_parse_logs.apply_async(**apply_kwargs)
+ return {
+ "stale_logs_marked_for_retry": resumed_logs,
+ "parse_logs_enqueued": True,
+ }
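+
+# Recovery roles: task_resume_log_exports releases daily metric jobs whose export
+# started more than stale_after_minutes ago and re-enqueues pending/error jobs that
+# still have queued/error logs or a stored payload, while task_resume_stale_parsing_logs
+# requeues log files stuck in the parsing status and then triggers a fresh
+# task_parse_logs run for the same date window.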
diff --git a/metrics/templates/search/indexes/metrics/top100articles_text.txt b/metrics/templates/search/indexes/metrics/top100articles_text.txt
deleted file mode 100644
index ccf5e94..0000000
--- a/metrics/templates/search/indexes/metrics/top100articles_text.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-{{ object.collection }}
-{{ object.key_issn }}
-{{ object.pid }}
-{{ object.yop }}
-{{ object.language }}
-{{ object.country }}
-{{ object.total_item_requests }}
-{{ object.total_item_investigations }}
-{{ object.unique_item_requests }}
-{{ object.unique_item_investigations }}
\ No newline at end of file
diff --git a/metrics/tests/test_cleanup.py b/metrics/tests/test_cleanup.py
new file mode 100644
index 0000000..e08fa9c
--- /dev/null
+++ b/metrics/tests/test_cleanup.py
@@ -0,0 +1,283 @@
+import json
+import os
+import shutil
+import tempfile
+import time
+from datetime import date
+from pathlib import Path
+from unittest.mock import patch
+
+from django.test import TestCase
+
+from collection.models import Collection
+from metrics.models import DailyMetricJob
+from metrics.services import daily_payloads
+
+
+class CleanupExportedPayloadsTests(TestCase):
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ cls._tmpdir = tempfile.TemporaryDirectory()
+ cls._patched_root = patch.object(
+ daily_payloads,
+ "get_daily_payload_root",
+ return_value=Path(cls._tmpdir.name),
+ )
+ cls._patched_root.start()
+
+ @classmethod
+ def tearDownClass(cls):
+ cls._patched_root.stop()
+ cls._tmpdir.cleanup()
+ super().tearDownClass()
+
+ def setUp(self):
+ self.collection = Collection.objects.create(acron3="books", acron2="bk")
+ self.other_collection = Collection.objects.create(acron3="scl", acron2="sc")
+
+ self.payload_root = daily_payloads.get_daily_payload_root()
+ self._clean_temp_dir()
+
+ def _clean_temp_dir(self):
+ root = self.payload_root
+ if root.exists():
+ for item in root.iterdir():
+ if item.is_dir():
+ shutil.rmtree(item)
+ else:
+ item.unlink()
+
+ def _create_job(self, collection, access_date, status, storage_path, payload_hash):
+ return DailyMetricJob.objects.create(
+ collection=collection,
+ access_date=access_date,
+ status=status,
+ storage_path=storage_path,
+ payload_hash=payload_hash,
+ )
+
+ def _write_payload_file(self, storage_path):
+ resolved = daily_payloads.resolve_storage_path(storage_path)
+ resolved.parent.mkdir(parents=True, exist_ok=True)
+ resolved.write_text(json.dumps({"test": True}), encoding="utf-8")
+ return resolved
+
+ def _set_file_age(self, file_path, days_old):
+ old_time = time.time() - days_old * 86400
+ os.utime(file_path, (old_time, old_time))
+
+ def test_cleanup_deletes_old_exported_payloads(self):
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ resolved = self._write_payload_file(path)
+ self._set_file_age(resolved, 30)
+
+ self._create_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ storage_path=path.as_posix(),
+ payload_hash="abc",
+ )
+
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=7)
+
+ self.assertEqual(result, 1)
+ self.assertFalse(resolved.exists())
+
+ def test_cleanup_skips_recent_files(self):
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ resolved = self._write_payload_file(path)
+
+ self._create_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ storage_path=path.as_posix(),
+ payload_hash="abc",
+ )
+
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=7)
+
+ self.assertEqual(result, 0)
+ self.assertTrue(resolved.exists())
+
+ def test_cleanup_skips_non_exported_jobs(self):
+ statuses = [
+ DailyMetricJob.STATUS_PENDING,
+ DailyMetricJob.STATUS_ERROR,
+ DailyMetricJob.STATUS_EXPORTING,
+ ]
+ paths = []
+ for i, status in enumerate(statuses):
+ access_date = date(2012, 3, 10 + i)
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, access_date
+ )
+ resolved = self._write_payload_file(path)
+ self._set_file_age(resolved, 30)
+ paths.append(resolved)
+
+ self._create_job(
+ collection=self.collection,
+ access_date=access_date,
+ status=status,
+ storage_path=path.as_posix(),
+ payload_hash="abc",
+ )
+
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=7)
+
+ self.assertEqual(result, 0)
+ for p in paths:
+ self.assertTrue(p.exists())
+
+ def test_cleanup_filters_by_collection(self):
+ path_books = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ path_scl = daily_payloads.build_daily_storage_path(
+ self.other_collection, date(2012, 3, 10)
+ )
+ resolved_books = self._write_payload_file(path_books)
+ resolved_scl = self._write_payload_file(path_scl)
+ self._set_file_age(resolved_books, 30)
+ self._set_file_age(resolved_scl, 30)
+
+ self._create_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ storage_path=path_books.as_posix(),
+ payload_hash="abc",
+ )
+ self._create_job(
+ collection=self.other_collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ storage_path=path_scl.as_posix(),
+ payload_hash="def",
+ )
+
+ result = daily_payloads.cleanup_exported_payloads(
+ collections=["books"],
+ older_than_days=7,
+ )
+
+ self.assertEqual(result, 1)
+ self.assertFalse(resolved_books.exists())
+ self.assertTrue(resolved_scl.exists())
+
+ def test_cleanup_deletes_orphan_files(self):
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ resolved = self._write_payload_file(path)
+ self._set_file_age(resolved, 30)
+
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=7)
+
+ self.assertEqual(result, 1)
+ self.assertFalse(resolved.exists())
+
+ def test_cleanup_skips_orphan_file_with_old_db_job_not_exported(self):
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ resolved = self._write_payload_file(path)
+ self._set_file_age(resolved, 30)
+
+ self._create_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_PENDING,
+ storage_path=path.as_posix(),
+ payload_hash="abc",
+ )
+
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=7)
+
+ self.assertEqual(result, 0)
+ self.assertTrue(resolved.exists())
+
+ def test_cleanup_clears_db_fields_for_exported_jobs(self):
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ resolved = self._write_payload_file(path)
+ self._set_file_age(resolved, 30)
+
+ job = self._create_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ storage_path=path.as_posix(),
+ payload_hash="abc",
+ )
+
+ daily_payloads.cleanup_exported_payloads(older_than_days=7)
+
+ job.refresh_from_db()
+ self.assertEqual(job.storage_path, "")
+ self.assertEqual(job.payload_hash, "")
+
+ def test_cleanup_with_no_matching_files(self):
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=7)
+ self.assertEqual(result, 0)
+
+ def test_cleanup_without_older_than_days_deletes_all(self):
+ path = daily_payloads.build_daily_storage_path(
+ self.collection, date(2012, 3, 10)
+ )
+ resolved = self._write_payload_file(path)
+
+ self._create_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ storage_path=path.as_posix(),
+ payload_hash="abc",
+ )
+
+ result = daily_payloads.cleanup_exported_payloads(older_than_days=0)
+
+ self.assertEqual(result, 1)
+ self.assertFalse(resolved.exists())
+
+
+class CleanupTaskTests(TestCase):
+ def setUp(self):
+ self.collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ def test_task_cleanup_daily_payloads_calls_service(self):
+ with patch("metrics.services.daily_payloads.cleanup_exported_payloads") as mock_cleanup:
+ mock_cleanup.return_value = 5
+ from metrics.tasks import task_cleanup_daily_payloads
+
+ result = task_cleanup_daily_payloads.run(
+ collections=["books"],
+ older_than_days=7,
+ )
+
+ mock_cleanup.assert_called_once_with(
+ collections=["books"],
+ older_than_days=7,
+ )
+ self.assertEqual(result, {"deleted_payloads": 5})
+
+ def test_task_cleanup_with_defaults(self):
+ with patch("metrics.services.daily_payloads.cleanup_exported_payloads") as mock_cleanup:
+ mock_cleanup.return_value = 0
+ from metrics.tasks import task_cleanup_daily_payloads
+
+ result = task_cleanup_daily_payloads.run()
+
+ mock_cleanup.assert_called_once_with(
+ collections=[],
+ older_than_days=7,
+ )
+ self.assertEqual(result, {"deleted_payloads": 0})
diff --git a/metrics/tests/test_daily_jobs.py b/metrics/tests/test_daily_jobs.py
new file mode 100644
index 0000000..f31b410
--- /dev/null
+++ b/metrics/tests/test_daily_jobs.py
@@ -0,0 +1,162 @@
+from datetime import date, timedelta
+
+from django.contrib.auth import get_user_model
+from django.test import TestCase
+from django.utils import timezone
+from scielo_usage_counter.values import CONTENT_TYPE_FULL_TEXT, MEDIA_FORMAT_HTML
+
+from collection.models import Collection
+from log_manager import choices
+from log_manager.models import LogFile
+from metrics.models import DailyMetricJob
+from metrics import services
+
+
+class DailyMetricJobServiceTests(TestCase):
+ def setUp(self):
+ self.collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ def _log_file(self, hash_value, status=choices.LOG_FILE_STATUS_QUEUED):
+ return LogFile.objects.create(
+ hash=hash_value,
+ path=f"/tmp/{hash_value}.log.gz",
+ stat_result={},
+ status=status,
+ collection=self.collection,
+ validation={"probably_date": "2012-03-10"},
+ )
+
+ def test_create_or_update_blocks_implicit_recompute_after_export(self):
+ first = self._log_file("1" * 32)
+ second = self._log_file("2" * 32)
+ DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTED,
+ input_log_hashes=[first.hash],
+ storage_path="books/2012/03/2012-03-10.json",
+ payload_hash="abc",
+ )
+
+ with self.assertRaises(RuntimeError):
+ services.create_or_update_daily_metric_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ log_files=[first, second],
+ )
+
+ def test_create_or_update_keeps_payload_for_export_retry(self):
+ log_file = self._log_file("1" * 32, status=choices.LOG_FILE_STATUS_ERROR)
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ input_log_hashes=[log_file.hash],
+ storage_path="books/2012/03/2012-03-10.json",
+ payload_hash="abc",
+ summary={"month_document_count": 1},
+ )
+
+ services.create_or_update_daily_metric_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ log_files=[log_file],
+ )
+
+ job.refresh_from_db()
+ self.assertEqual(job.status, DailyMetricJob.STATUS_PENDING)
+ self.assertEqual(job.storage_path, "books/2012/03/2012-03-10.json")
+ self.assertEqual(job.payload_hash, "abc")
+ self.assertEqual(job.summary, {"month_document_count": 1})
+
+ def test_create_or_update_clears_stale_payload_when_inputs_change_before_success(self):
+ first = self._log_file("1" * 32)
+ second = self._log_file("2" * 32)
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ input_log_hashes=[first.hash],
+ storage_path="books/2012/03/2012-03-10.json",
+ payload_hash="abc",
+ summary={"month_document_count": 1},
+ )
+
+ services.create_or_update_daily_metric_job(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ log_files=[first, second],
+ )
+
+ job.refresh_from_db()
+ self.assertEqual(job.input_log_hashes, sorted([first.hash, second.hash]))
+ self.assertEqual(job.storage_path, "")
+ self.assertEqual(job.payload_hash, "")
+ self.assertEqual(job.summary, {})
+
+ def test_release_stale_daily_metric_jobs_marks_logs_for_retry(self):
+ log_file = self._log_file("1" * 32, status=choices.LOG_FILE_STATUS_PARSING)
+ DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTING,
+ input_log_hashes=[log_file.hash],
+ export_started_at=timezone.now() - timedelta(minutes=120),
+ )
+
+ released = services.release_stale_daily_metric_jobs(stale_after_minutes=60)
+
+ log_file.refresh_from_db()
+ self.assertEqual(released, 1)
+ self.assertEqual(log_file.status, choices.LOG_FILE_STATUS_ERROR)
+ self.assertIsNone(log_file.parse_heartbeat_at)
+
+ def test_process_line_discards_invalid_local_datetime_without_raising(self):
+ class FakeUtm:
+ def translate(self, url):
+ return {
+ "book_id": "q7gtd",
+ "pid_generic": "book:q7gtd",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ }
+
+ log_file = self._log_file("1" * 32)
+ results = {}
+
+ is_valid, error = services.process_line(
+ results=results,
+ line={
+ "url": "/id/q7gtd",
+ "client_name": "browser",
+ "client_version": "1.0",
+ "ip_address": "127.0.0.1",
+ "country_code": "BR",
+ "local_datetime": None,
+ },
+ utm=FakeUtm(),
+ log_file=log_file,
+ )
+
+ self.assertFalse(is_valid)
+ self.assertIsNone(error)
+ self.assertEqual(results, {})
+
+ def test_mark_daily_metric_job_exported_records_updated_by(self):
+ user = get_user_model().objects.create_user(
+ username="tester",
+ email="tester@example.org",
+ password="secret",
+ )
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTING,
+ )
+
+ services.mark_daily_metric_job_exported(job, user=user)
+
+ job.refresh_from_db()
+ self.assertEqual(job.status, DailyMetricJob.STATUS_EXPORTED)
+ self.assertIsNotNone(job.exported_at)
diff --git a/metrics/tests/test_index_utils.py b/metrics/tests/test_index_utils.py
index 47f1a0e..562fc42 100644
--- a/metrics/tests/test_index_utils.py
+++ b/metrics/tests/test_index_utils.py
@@ -1,104 +1,894 @@
+import csv
import unittest
+from datetime import datetime
+from pathlib import Path
+from tempfile import TemporaryDirectory
from scielo_usage_counter.values import (
- MEDIA_FORMAT_UNDEFINED,
- MEDIA_FORMAT_PDF,
- MEDIA_FORMAT_HTML,
- CONTENT_TYPE_UNDEFINED,
- CONTENT_TYPE_FULL_TEXT,
CONTENT_TYPE_ABSTRACT,
+ CONTENT_TYPE_FULL_TEXT,
+ CONTENT_TYPE_UNDEFINED,
DEFAULT_SCIELO_ISSN,
+ MEDIA_FORMAT_HTML,
+ MEDIA_FORMAT_PDF,
+ MEDIA_FORMAT_UNDEFINED,
)
-from metrics.utils import index_utils
+from metrics.counter import access, documents as index_docs
+from metrics.opensearch.names import generate_month_index_name, generate_year_index_name
-class TestIndexUtils(unittest.TestCase):
+class TestIndexUtils(unittest.TestCase):
def test_is_valid_item_access_data_valid(self):
data = {
- 'scielo_issn': '1234-5678',
- 'pid_v2': 'S0102-67202020000100001',
- 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv',
- 'media_format': MEDIA_FORMAT_PDF,
- 'content_type': CONTENT_TYPE_FULL_TEXT,
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertTrue(result)
def test_is_valid_item_access_data_missing_scielo_issn(self):
data = {
- 'scielo_issn': '',
- 'pid_v2': 'S0102-67202020000100001',
- 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv',
- 'media_format': MEDIA_FORMAT_PDF,
- 'content_type': CONTENT_TYPE_FULL_TEXT,
+ "scielo_issn": "",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertFalse(result)
+ def test_is_valid_item_access_data_valid_book_source(self):
+ data = {
+ "source_type": "book",
+ "source_id": "q7gtd",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_generic": "BOOK:Q7GTD",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ }
+ result, _ = access.is_valid_item_access_data(data)
+ self.assertTrue(result)
+
def test_is_valid_item_access_data_undefined_media_format(self):
data = {
- 'scielo_issn': '1234-5678',
- 'pid_v2': 'S0102-67202020000100001',
- 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv',
- 'media_format': MEDIA_FORMAT_UNDEFINED,
- 'content_type': CONTENT_TYPE_FULL_TEXT,
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_UNDEFINED,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertFalse(result)
def test_is_valid_item_access_data_undefined_content_type(self):
data = {
- 'scielo_issn': '1234-5678',
- 'pid_v2': 'S0102-67202020000100001',
- 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv',
- 'media_format': MEDIA_FORMAT_PDF,
- 'content_type': CONTENT_TYPE_UNDEFINED,
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_UNDEFINED,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertFalse(result)
def test_is_valid_item_access_data_missing_pid_v2_and_pid_v3(self):
data = {
- 'scielo_issn': '1234-5678',
- 'pid_v2': '',
- 'pid_v3': '',
- 'media_format': MEDIA_FORMAT_PDF,
- 'content_type': CONTENT_TYPE_FULL_TEXT,
+ "scielo_issn": "1234-5678",
+ "pid_v2": "",
+ "pid_v3": "",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertFalse(result)
def test_is_valid_item_access_data_media_format_html(self):
data = {
- 'scielo_issn': '1234-5678',
- 'pid_v2': 'S0102-67202020000100001',
- 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv',
- 'media_format': MEDIA_FORMAT_HTML,
- 'content_type': CONTENT_TYPE_FULL_TEXT,
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertTrue(result)
def test_is_valid_item_access_data_content_type_abstract(self):
data = {
- 'scielo_issn': '1234-5678',
- 'pid_v2': 'S0102-67202020000100001',
- 'pid_v3': 'jGJccQ7bFdbz6wy3nfXGVdv',
- 'media_format': MEDIA_FORMAT_PDF,
- 'content_type': CONTENT_TYPE_ABSTRACT
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_ABSTRACT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertTrue(result)
- def test_is_valid_item_acess_data_dataverse(self):
+ def test_is_valid_item_access_data_dataset_without_source_or_language_is_valid(self):
data = {
- 'scielo_issn': DEFAULT_SCIELO_ISSN,
- 'pid_v2': None,
- 'pid_v3': None,
- 'pid_generic': 'DOI:10.48331/SCIELODATA.JLMAIY',
- 'media_format': MEDIA_FORMAT_HTML,
- 'content_type': CONTENT_TYPE_ABSTRACT,
+ "document_type": "dataset",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "DOI:10.48331/SCIELODATA.JLMAIY",
+ "media_language": "un",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_ABSTRACT,
}
- result, _ = index_utils.is_valid_item_access_data(data)
+ result, _ = access.is_valid_item_access_data(data)
self.assertTrue(result)
+
+ def test_is_valid_item_access_data_missing_media_language_is_invalid(self):
+ data = {
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_language": "",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ }
+ result, _ = access.is_valid_item_access_data(data)
+ self.assertFalse(result)
+
+ def test_extract_item_access_data_normalizes_source_fields_for_journal(self):
+ data = access.extract_item_access_data(
+ "scl",
+ {
+ "scielo_issn": "1234-5678",
+ "pid_v2": "S0102-67202020000100001",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "publication_year": "2024",
+ "journal_main_title": "Journal Title",
+ "journal_subject_area_capes": ["Health Sciences"],
+ "journal_subject_area_wos": ["Medicine"],
+ "journal_acronym": "testjou",
+ "journal_publisher_name": ["SciELO"],
+ },
+ )
+
+ self.assertEqual(data["source_type"], "journal")
+ self.assertEqual(data["source_id"], "1234-5678")
+ self.assertEqual(data["source_main_title"], "Journal Title")
+ self.assertEqual(data["source_acronym"], "testjou")
+
+ def test_extract_item_access_data_normalizes_source_fields_for_books(self):
+ data = access.extract_item_access_data(
+ "books",
+ {
+ "book_id": "q7gtd",
+ "book_title": "Book Title",
+ "title_pid_generic": "book:q7gtd",
+ "pid_generic": "book:q7gtd/chapter:03",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "publication_year": "2023",
+ },
+ )
+
+ self.assertEqual(data["source_type"], "book")
+ self.assertEqual(data["source_id"], "q7gtd")
+ self.assertEqual(data["scielo_issn"], DEFAULT_SCIELO_ISSN)
+ self.assertEqual(data["source_main_title"], "Book Title")
+ self.assertEqual(data["title_pid_generic"], "BOOK:Q7GTD")
+
+ def test_extract_item_access_data_preserves_access_url_and_free_to_read(self):
+ data = access.extract_item_access_data(
+ "books",
+ {
+ "book_id": "c2248",
+ "book_title": "Book Title",
+ "title_pid_generic": "book:c2248",
+ "pid_generic": "book:c2248",
+ "media_language": "pt",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_url": "/id/c2248/pdf/freitas-9788599662830.pdf",
+ "source_access_type": "free_to_read",
+ },
+ )
+
+ self.assertEqual(data["access_url"], "/id/c2248/pdf/freitas-9788599662830.pdf")
+ self.assertEqual(data["counter_access_type"], "Free_To_Read")
+
+ def test_extract_item_access_data_tolerates_malformed_media_language(self):
+ data = access.extract_item_access_data(
+ "books",
+ {
+ "book_id": "q7gtd",
+ "pid_generic": "book:q7gtd",
+ "media_language": "'",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ },
+ )
+
+ self.assertEqual(data["media_language"], "un")
+
+ def test_extract_item_access_data_normalizes_scielo_collection_document_types(self):
+ preprint = access.extract_item_access_data(
+ "preprints",
+ {
+ "pid_generic": "10.1590/SciELOPreprints.1234",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ },
+ )
+ dataset = access.extract_item_access_data(
+ "data",
+ {
+ "pid_generic": "10.48331/scielodata.abc123",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_ABSTRACT,
+ },
+ )
+ article = access.extract_item_access_data(
+ "scl",
+ {
+ "scielo_issn": "1234-5678",
+ "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ },
+ )
+
+ self.assertEqual(preprint["source_type"], "preprint_server")
+ self.assertEqual(preprint["document_type"], "preprint")
+ self.assertEqual(dataset["source_type"], "data_repository")
+ self.assertEqual(dataset["document_type"], "dataset")
+ self.assertEqual(article["source_type"], "journal")
+ self.assertEqual(article["document_type"], "article")
+
+ def test_update_results_with_item_access_data_stores_source_and_periods(self):
+ results = {}
+ item_access_data = {
+ "collection": "books",
+ "source_type": "book",
+ "source_id": "q7gtd",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:Q7GTD",
+ "title_pid_generic": "BOOK:Q7GTD",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "publication_year": "2023",
+ "source_main_title": "Book Title",
+ "source_subject_area_capes": [],
+ "source_subject_area_wos": [],
+ "source_acronym": None,
+ "source_publisher_name": ["SciELO Books"],
+ }
+ line = {
+ "client_name": "browser",
+ "client_version": "1.0",
+ "ip_address": "127.0.0.1",
+ "country_code": "BR",
+ "local_datetime": datetime(2024, 1, 15, 10, 0, 5),
+ }
+
+ access.update_results_with_item_access_data(results, item_access_data, line)
+
+ self.assertEqual(len(results), 1)
+ result = next(iter(results.values()))
+ self.assertEqual(result["source"]["source_type"], "book")
+ self.assertEqual(result["source"]["source_id"], "q7gtd")
+ self.assertEqual(result["source"]["main_title"], "Book Title")
+ self.assertEqual(result["access_date"], "2024-01-15")
+ self.assertEqual(result["access_month"], "202401")
+ self.assertEqual(result["access_year"], "2024")
+ self.assertEqual(result["access_country_code"], "BR")
+ self.assertEqual(result["content_language"], "en")
+ self.assertEqual(result["title_pid_generic"], "BOOK:Q7GTD")
+ self.assertIn("user_session_id", result)
+
+ def test_update_results_with_item_access_data_rejects_invalid_local_datetime(self):
+ results = {}
+ item_access_data = {
+ "collection": "books",
+ "source_type": "book",
+ "source_id": "q7gtd",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_generic": "BOOK:Q7GTD",
+ "media_language": "en",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ }
+ line = {
+ "client_name": "browser",
+ "client_version": "1.0",
+ "ip_address": "127.0.0.1",
+ "country_code": "BR",
+ "local_datetime": None,
+ }
+
+ with self.assertRaises(ValueError):
+ access.update_results_with_item_access_data(results, item_access_data, line)
+
+ self.assertEqual(results, {})
+
+ def test_update_results_with_item_access_data_does_not_expand_book_into_segments(self):
+ results = {}
+ item_access_data = {
+ "collection": "books",
+ "source_type": "book",
+ "source_id": "c2248",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248",
+ "title_pid_generic": "BOOK:C2248",
+ "segment_pid_generics": [
+ "BOOK:C2248/CHAPTER:00",
+ "BOOK:C2248/CHAPTER:01",
+ "BOOK:C2248/CHAPTER:02",
+ ],
+ "media_language": "pt",
+ "media_format": MEDIA_FORMAT_PDF,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "publication_year": "2018",
+ "source_main_title": "C2248 Book",
+ }
+ line = {
+ "client_name": "browser",
+ "client_version": "1.0",
+ "ip_address": "127.0.0.1",
+ "country_code": "BR",
+ "local_datetime": datetime(2024, 1, 15, 10, 0, 5),
+ }
+
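+        # Even though the book lists chapter segment PIDs, a whole-book access
+        # is stored as a single result row rather than expanded per chapter.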
+ access.update_results_with_item_access_data(results, item_access_data, line)
+
+ self.assertEqual(len(results), 1)
+ result = list(results.values())[0]
+ self.assertEqual(result["pid_generic"], "BOOK:C2248")
+
+ def test_double_click_filter_uses_url_bucket_for_same_item(self):
+ results = {}
+ item_access_data = {
+ "collection": "books",
+ "source_type": "book",
+ "source_id": "c2248",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248/CHAPTER:03",
+ "title_pid_generic": "BOOK:C2248",
+ "media_language": "pt",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "publication_year": "2018",
+ "source_main_title": "C2248 Book",
+ }
+ base_line = {
+ "client_name": "browser",
+ "client_version": "1.0",
+ "ip_address": "127.0.0.1",
+ "country_code": "BR",
+ }
+
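+        # Same item reached through two different URLs: clicks are bucketed per
+        # normalized URL, so both count as total requests while unique stays 1.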
+ access.update_results_with_item_access_data(
+ results,
+ item_access_data,
+ {
+ **base_line,
+ "local_datetime": datetime(2024, 1, 15, 10, 0, 5),
+ "url": "/id/c2248/03",
+ },
+ )
+ access.update_results_with_item_access_data(
+ results,
+ item_access_data,
+ {
+ **base_line,
+ "local_datetime": datetime(2024, 1, 15, 10, 0, 20),
+ "url": "https://books.scielo.org/id/c2248/epub/03.html?x=1",
+ },
+ )
+
+ raw = next(iter(results.values()))
+ self.assertEqual(
+ set(raw["click_timestamps_by_url"]),
+ {"/id/c2248/03", "/id/c2248/epub/03.html"},
+ )
+
+ metrics_data = index_docs.convert_raw_results_to_index_documents(results)
+ month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"]
+
+ self.assertEqual(month_item["total_requests"], 2)
+ self.assertEqual(month_item["unique_requests"], 1)
+
+ def test_double_click_filter_collapses_same_url_within_30_seconds(self):
+ results = {}
+ item_access_data = {
+ "collection": "books",
+ "source_type": "book",
+ "source_id": "c2248",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248/CHAPTER:03",
+ "title_pid_generic": "BOOK:C2248",
+ "media_language": "pt",
+ "media_format": MEDIA_FORMAT_HTML,
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "publication_year": "2018",
+ "source_main_title": "C2248 Book",
+ }
+ base_line = {
+ "client_name": "browser",
+ "client_version": "1.0",
+ "ip_address": "127.0.0.1",
+ "country_code": "BR",
+ "url": "/id/c2248/03?from=search",
+ }
+
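+        # Two clicks on the same normalized URL 15 seconds apart: the COUNTER
+        # double-click rule (30-second window) collapses them into one request.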
+ access.update_results_with_item_access_data(
+ results,
+ item_access_data,
+ {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 5)},
+ )
+ access.update_results_with_item_access_data(
+ results,
+ item_access_data,
+ {**base_line, "local_datetime": datetime(2024, 1, 15, 10, 0, 20)},
+ )
+
+ raw = next(iter(results.values()))
+ self.assertEqual(
+ raw["click_timestamps_by_url"],
+ {"/id/c2248/03": {"00:05": 1, "00:20": 1}},
+ )
+
+ metrics_data = index_docs.convert_raw_results_to_index_documents(results)
+ month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"]
+
+ self.assertEqual(month_item["total_requests"], 1)
+ self.assertEqual(month_item["unique_requests"], 1)
+
+ def test_generate_index_names_for_year_and_month(self):
+ self.assertEqual(
+ generate_year_index_name("usage", "scl", "2024-01-15"),
+ "usage_yearly_scl_2024",
+ )
+ self.assertEqual(
+ generate_month_index_name("usage", "scl", "2024-01-15"),
+ "usage_monthly_scl_2024",
+ )
+ self.assertEqual(
+ generate_year_index_name("usage", "books", "2024-01-15"),
+ "usage_yearly_books",
+ )
+ self.assertEqual(
+ generate_month_index_name("usage", "books", "2024-01-15"),
+ "usage_monthly_books",
+ )
+
+ def test_convert_raw_results_to_index_documents_creates_month_and_year_views(self):
+ data = {
+ "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|browser|1.0|127.0.0.1|BR|en|html|full_text": {
+ "collection": "books",
+ "source_key": "q7gtd",
+ "document_type": "chapter",
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:Q7GTD/CHAPTER:03",
+ "title_pid_generic": "BOOK:Q7GTD",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "en",
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_month": "202401",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "q7gtd",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "main_title": "Book Title",
+ "identifiers": {
+ "book_id": "q7gtd",
+ "isbn": "9788578791889",
+ },
+ "city": "Sao Paulo",
+ "country": "BR",
+ "subject_area_capes": [],
+ "subject_area_wos": [],
+ "acronym": None,
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2023",
+ }
+ }
+
+ metrics_data = index_docs.convert_raw_results_to_index_documents(data)
+
+ self.assertEqual(set(metrics_data.keys()), {"month", "year"})
+ self.assertEqual(len(metrics_data["month"]), 2)
+ self.assertEqual(len(metrics_data["year"]), 2)
+
+ month_item = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|2024-01|Open|Regular|2023"]
+ self.assertEqual(month_item["access_month"], "2024-01")
+ self.assertNotIn("access_country_code", month_item)
+ self.assertNotIn("content_language", month_item)
+ self.assertEqual(month_item["document_type"], "chapter")
+ self.assertEqual(month_item["metric_scope"], "item")
+ self.assertEqual(month_item["counter_data_type"], "Book_Segment")
+ self.assertEqual(month_item["title_pid_generic"], "BOOK:Q7GTD")
+ self.assertEqual(month_item["total_requests"], 1)
+ self.assertEqual(month_item["unique_requests"], 1)
+ self.assertNotIn("scielo_issn", month_item["source"])
+ self.assertEqual(month_item["source"]["identifiers"]["book_id"], "q7gtd")
+ self.assertEqual(month_item["source"]["publisher"], ["SciELO Books"])
+
+ month_title = metrics_data["month"]["title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023"]
+ self.assertEqual(month_title["document_type"], "book")
+ self.assertEqual(month_title["metric_scope"], "title")
+ self.assertEqual(month_title["counter_data_type"], "Book")
+ self.assertEqual(month_title["pid_generic"], "BOOK:Q7GTD")
+ self.assertEqual(month_title["total_requests"], 1)
+ self.assertEqual(month_title["total_investigations"], 1)
+ self.assertEqual(month_title["unique_requests"], 1)
+ self.assertEqual(month_title["unique_investigations"], 1)
+
+ year_item = metrics_data["year"][
+ "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|en|BR|2024|Open|Regular|2023"
+ ]
+ self.assertEqual(year_item["access_year"], "2024")
+ self.assertEqual(year_item["access_country_code"], "BR")
+ self.assertEqual(year_item["content_language"], "en")
+ self.assertEqual(year_item["metric_scope"], "item")
+ self.assertEqual(year_item["total_requests"], 1)
+
+ year_title = metrics_data["year"][
+ "title|books|q7gtd|||BOOK:Q7GTD|en|BR|2024|Open|Regular|2023"
+ ]
+ self.assertEqual(year_title["metric_scope"], "title")
+ self.assertEqual(year_title["total_requests"], 1)
+ self.assertEqual(year_title["total_investigations"], 1)
+ self.assertEqual(year_title["unique_requests"], 1)
+ self.assertEqual(year_title["unique_investigations"], 1)
+
+ def test_convert_raw_results_to_index_documents_maps_counter_data_types(self):
+ data = {
+ "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|sess|BR|un|html|full_text": {
+ "collection": "preprints",
+ "source_key": "scielo-preprints",
+ "document_type": "preprint",
+ "pid_generic": "10.1590/SCIELOPREPRINTS.1234",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "un",
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_year": "2024",
+ "source": {
+ "source_type": "preprint_server",
+ "source_id": "scielo-preprints",
+ "main_title": "SciELO Preprints",
+ },
+ "publication_year": "2024",
+ },
+ "data|scielo-data|||10.48331/SCIELODATA.ABC123|sess|BR|un|html|abstract": {
+ "collection": "data",
+ "source_key": "scielo-data",
+ "document_type": "dataset",
+ "pid_generic": "10.48331/SCIELODATA.ABC123",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "un",
+ "content_type": CONTENT_TYPE_ABSTRACT,
+ "access_date": "2024-01-15",
+ "access_year": "2024",
+ "source": {
+ "source_type": "data_repository",
+ "source_id": "scielo-data",
+ "main_title": "SciELO Data",
+ },
+ "publication_year": "2024",
+ },
+ }
+
+ metrics_data = index_docs.convert_raw_results_to_index_documents(data)
+ preprint_doc = metrics_data["month"][
+ "preprints|scielo-preprints|||10.1590/SCIELOPREPRINTS.1234|2024-01|Open|Regular|2024"
+ ]
+ dataset_doc = metrics_data["month"][
+ "data|scielo-data|||10.48331/SCIELODATA.ABC123|2024-01|Open|Regular|2024"
+ ]
+
+ self.assertEqual(preprint_doc["counter_data_type"], "Article")
+ self.assertEqual(preprint_doc["scielo_document_type"], "preprint")
+ self.assertEqual(preprint_doc["article_version"], "Preprint")
+ self.assertEqual(dataset_doc["counter_data_type"], "Dataset")
+ self.assertIsNone(dataset_doc["article_version"])
+
+ def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_formats(self):
+ data = {
+ "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|html|full_text": {
+ "collection": "books",
+ "source_key": "c2248",
+ "document_type": "chapter",
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248/CHAPTER:03",
+ "title_pid_generic": "BOOK:C2248",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "pt",
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_month": "202401",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "c2248",
+ "main_title": "C2248 Book",
+ "identifiers": {"book_id": "c2248", "isbn": "9788599662830"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2018",
+ },
+ "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": {
+ "collection": "books",
+ "source_key": "c2248",
+ "document_type": "chapter",
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248/CHAPTER:03",
+ "title_pid_generic": "BOOK:C2248",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:45": 1},
+ "access_country_code": "BR",
+ "content_language": "pt",
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_month": "202401",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "c2248",
+ "main_title": "C2248 Book",
+ "identifiers": {"book_id": "c2248", "isbn": "9788599662830"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2018",
+ },
+ }
+
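+        # Same session requested the chapter in HTML and PDF: totals count both
+        # formats, but unique metrics are deduplicated per session and item.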
+ metrics_data = index_docs.convert_raw_results_to_index_documents(data)
+
+ month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"]
+ month_title = metrics_data["month"]["title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"]
+
+ self.assertEqual(month_item["total_requests"], 2)
+ self.assertEqual(month_item["total_investigations"], 2)
+ self.assertEqual(month_item["unique_requests"], 1)
+ self.assertEqual(month_item["unique_investigations"], 1)
+ self.assertEqual(month_title["unique_requests"], 1)
+ self.assertEqual(month_title["unique_investigations"], 1)
+
+ def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_item_scope(self):
+ data = {
+ "books|c2248|||BOOK:C2248|sess|BR|pt|html|abstract": {
+ "collection": "books",
+ "source_key": "c2248",
+ "document_type": "book",
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248",
+ "title_pid_generic": "BOOK:C2248",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "pt",
+ "content_type": CONTENT_TYPE_ABSTRACT,
+ "access_date": "2024-01-15",
+ "access_month": "202401",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "c2248",
+ "main_title": "C2248 Book",
+ "identifiers": {"book_id": "c2248", "isbn": "9788599662830"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2018",
+ },
+ }
+
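+        # An abstract (landing-page) hit on the whole book should yield only a
+        # title-scope document, never an item-scope one.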
+ metrics_data = index_docs.convert_raw_results_to_index_documents(data)
+
+ self.assertEqual(
+ set(metrics_data["month"].keys()),
+ {"title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"},
+ )
+ self.assertEqual(
+ set(metrics_data["year"].keys()),
+ {"title|books|c2248|||BOOK:C2248|pt|BR|2024|Open|Regular|2018"},
+ )
+
+ def test_convert_raw_results_to_index_documents_counts_whole_book_without_segments_as_book_segment(self):
+ data = {
+ "books|c2248|||BOOK:C2248|sess|BR|pt|pdf|full_text": {
+ "collection": "books",
+ "source_key": "c2248",
+ "document_type": "book",
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248",
+ "title_pid_generic": "BOOK:C2248",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "pt",
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_month": "202401",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "c2248",
+ "main_title": "C2248 Book",
+ "identifiers": {"book_id": "c2248"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2018",
+ },
+ }
+
+ metrics_data = index_docs.convert_raw_results_to_index_documents(data)
+ month_item = metrics_data["month"]["books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"]
+ month_title = metrics_data["month"]["title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"]
+
+ self.assertEqual(month_item["counter_data_type"], "Book_Segment")
+ self.assertEqual(month_item["metric_scope"], "item")
+ self.assertEqual(month_title["counter_data_type"], "Book")
+ self.assertEqual(month_title["metric_scope"], "title")
+
+ def test_convert_raw_results_aggregates_multiple_chapters_correctly(self):
+ """Test that accessing multiple chapters creates correct title-level totals"""
+ data = {
+ "books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|session1|BR|en|html|full_text": {
+ "collection": "books",
+ "source_key": "q7gtd",
+ "document_type": "chapter",
+ "pid_generic": "BOOK:Q7GTD/CHAPTER:01",
+ "title_pid_generic": "BOOK:Q7GTD",
+ "user_session_id": "session1",
+ "click_timestamps": {"00:05": 1},
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "q7gtd",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "main_title": "Book Title",
+ "identifiers": {"book_id": "q7gtd"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2023",
+ },
+ "books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|session1|BR|en|html|full_text": {
+ "collection": "books",
+ "source_key": "q7gtd",
+ "document_type": "chapter",
+ "pid_generic": "BOOK:Q7GTD/CHAPTER:02",
+ "title_pid_generic": "BOOK:Q7GTD",
+ "user_session_id": "session1", # SAME SESSION
+ "click_timestamps": {"00:10": 1},
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "q7gtd",
+ "scielo_issn": DEFAULT_SCIELO_ISSN,
+ "main_title": "Book Title",
+ "identifiers": {"book_id": "q7gtd"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2023",
+ },
+ }
+
+ metrics_data = index_docs.convert_raw_results_to_index_documents(data)
+
+        # Each view (month and year) should have 2 item documents (one per chapter) + 1 title document
+ self.assertEqual(len(metrics_data["month"]), 3) # 2 items + 1 title
+ self.assertEqual(len(metrics_data["year"]), 3) # 2 items + 1 title
+
+ # Each item should have total=1, unique=1
+ month_item_1 = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|2024-01|Open|Regular|2023"]
+ self.assertEqual(month_item_1["total_requests"], 1)
+ self.assertEqual(month_item_1["unique_requests"], 1)
+
+ month_item_2 = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|2024-01|Open|Regular|2023"]
+ self.assertEqual(month_item_2["total_requests"], 1)
+ self.assertEqual(month_item_2["unique_requests"], 1)
+
+ # Title should have total=2 (sum of both chapters)
+ # Title unique should be 1 (same session accessed book, counted once)
+ month_title = metrics_data["month"]["title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023"]
+ self.assertEqual(month_title["total_requests"], 2)
+ self.assertEqual(month_title["total_investigations"], 2)
+ self.assertEqual(month_title["unique_requests"], 1)
+ self.assertEqual(month_title["unique_investigations"], 1)
+
+ def test_export_book_r51_monthly_metrics_writes_counter_title_columns(self):
+ from metrics.management.commands.export_book_r51_monthly_metrics import Command
+
+ command = Command()
+ monthly_documents = command._build_monthly_documents(
+ {
+ "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|pdf|full_text": {
+ "collection": "books",
+ "source_key": "c2248",
+ "document_type": "chapter",
+ "pid_v2": None,
+ "pid_v3": None,
+ "pid_generic": "BOOK:C2248/CHAPTER:03",
+ "title_pid_generic": "BOOK:C2248",
+ "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10",
+ "click_timestamps": {"00:05": 1},
+ "access_country_code": "BR",
+ "content_language": "pt",
+ "content_type": CONTENT_TYPE_FULL_TEXT,
+ "access_date": "2024-01-15",
+ "access_year": "2024",
+ "source": {
+ "source_type": "book",
+ "source_id": "c2248",
+ "main_title": "C2248 Book",
+ "identifiers": {"book_id": "c2248"},
+ "publisher_name": ["SciELO Books"],
+ },
+ "publication_year": "2018",
+ }
+ }
+ )
+
+ with TemporaryDirectory() as tmpdir:
+ title_path = Path(tmpdir) / "title.csv"
+ command._write_title_csv(title_path, monthly_documents["title"])
+
+ with title_path.open(newline="") as fh:
+ reader = csv.DictReader(fh)
+ rows = list(reader)
+
+ self.assertEqual(
+ reader.fieldnames,
+ [
+ "year_month",
+ "title_pid_generic",
+ "document_type",
+ "total_item_requests",
+ "total_item_investigations",
+ "unique_title_requests",
+ "unique_title_investigations",
+ ],
+ )
+ self.assertNotIn("total_title_requests", reader.fieldnames)
+ self.assertEqual(rows[0]["year_month"], "2024-01")
+ self.assertEqual(rows[0]["total_item_requests"], "1")
+ self.assertEqual(rows[0]["unique_title_requests"], "1")
diff --git a/metrics/tests/test_opensearch.py b/metrics/tests/test_opensearch.py
new file mode 100644
index 0000000..80586f9
--- /dev/null
+++ b/metrics/tests/test_opensearch.py
@@ -0,0 +1,92 @@
+from unittest import TestCase
+from unittest.mock import Mock, patch
+
+from django.test import override_settings
+
+from metrics import opensearch
+
+
+class OpenSearchUsageClientTests(TestCase):
+ @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client")
+ def test_create_index_sends_mappings_in_request_body(self, mock_get_client):
+ mock_client = Mock()
+ mock_get_client.return_value = mock_client
+
+ client = opensearch.OpenSearchUsageClient(url="https://example.org:9200")
+ client.create_index(
+ index_name="usage_monthly_books_202506",
+ mappings=opensearch.MONTH_INDEX_MAPPINGS,
+ )
+
+ mock_client.indices.create.assert_called_once_with(
+ index="usage_monthly_books_202506",
+ body={
+ "settings": {"index": {"number_of_replicas": 0}},
+ "mappings": opensearch.MONTH_INDEX_MAPPINGS,
+ },
+ )
+
+ @override_settings(
+ OPENSEARCH_VERIFY_CERTS=True,
+ OPENSEARCH_BASIC_AUTH=None,
+ OPENSEARCH_API_KEY=None,
+ )
+ @patch("metrics.opensearch.client.OpenSearch")
+ def test_verify_certs_false_explicitly_overrides_settings(self, mock_opensearch):
+ opensearch.OpenSearchUsageClient(
+ url="https://example.org:9200",
+ verify_certs=False,
+ )
+
+ mock_opensearch.assert_called_once_with(
+ "https://example.org:9200",
+ verify_certs=False,
+ )
+
+ def test_get_index_mappings_returns_books_specific_mappings(self):
+ self.assertIs(
+ opensearch.get_index_mappings("books", "month"),
+ opensearch.BOOKS_MONTH_INDEX_MAPPINGS,
+ )
+ self.assertIs(
+ opensearch.get_index_mappings("books", "year"),
+ opensearch.BOOKS_YEAR_INDEX_MAPPINGS,
+ )
+ self.assertIn("metric_scope", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"])
+ self.assertIn("counter_data_type", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"])
+ self.assertIn("title_pid_generic", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"])
+ self.assertIn("applied_jobs", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"])
+
+ @patch("metrics.opensearch.client.helpers.bulk")
+ @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client")
+ def test_increment_documents_for_daily_job_uses_applied_jobs(
+ self,
+ mock_get_client,
+ mock_bulk,
+ ):
+ mock_get_client.return_value = Mock()
+ client = opensearch.OpenSearchUsageClient(url="https://example.org:9200")
+
+ client.increment_documents_for_daily_job(
+ index_name="usage_monthly_books_202506",
+ documents={
+ "doc-1": {
+ "collection": "books",
+ "pid": "BOOK:WD",
+ "pid_generic": "BOOK:WD",
+ "access_date": "2025-06-03",
+ "total_requests": 3,
+ "total_investigations": 4,
+ "unique_requests": 2,
+ "unique_investigations": 3,
+ }
+ },
+ job_id="books|2025-06-03|abc123",
+ )
+
+ actions = list(mock_bulk.call_args.args[1])
+ self.assertEqual(len(actions), 1)
+ action = actions[0]
+ self.assertEqual(action["_op_type"], "update")
+ self.assertEqual(action["script"]["params"]["job_id"], "books|2025-06-03|abc123")
+ self.assertEqual(action["upsert"], {"applied_jobs": []})
diff --git a/metrics/tests/test_tasks.py b/metrics/tests/test_tasks.py
new file mode 100644
index 0000000..932944f
--- /dev/null
+++ b/metrics/tests/test_tasks.py
@@ -0,0 +1,268 @@
+from datetime import date, timedelta
+from unittest.mock import patch
+
+from django.test import TestCase
+from django.utils import timezone
+
+from collection.models import Collection
+from log_manager import choices
+from log_manager.models import LogFile
+from metrics import tasks
+from metrics.models import DailyMetricJob
+
+
+class ParseLogsTaskTests(TestCase):
+ def setUp(self):
+ self.collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ def _log_file(self, hash_value, probably_date, status=choices.LOG_FILE_STATUS_QUEUED):
+ return LogFile.objects.create(
+ hash=hash_value,
+ path=f"/tmp/{hash_value}.log.gz",
+ stat_result={},
+ status=status,
+ collection=self.collection,
+ date=date.fromisoformat(probably_date),
+ validation={"probably_date": probably_date},
+ )
+
+ def test_task_parse_logs_enqueues_one_daily_job_per_collection_date(self):
+ first = self._log_file("1" * 32, "2012-03-10")
+ second = self._log_file("2" * 32, "2012-03-10")
+ third = self._log_file("3" * 32, "2012-03-15")
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ result = tasks.task_parse_logs.run(
+ collections=["books"],
+ include_logs_with_error=False,
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ )
+
+ self.assertEqual(result["enqueued_jobs"], 2)
+ self.assertEqual(mocked_apply_async.call_count, 2)
+ jobs = list(DailyMetricJob.objects.order_by("access_date"))
+ self.assertEqual([job.access_date for job in jobs], [date(2012, 3, 10), date(2012, 3, 15)])
+ self.assertEqual(jobs[0].input_log_hashes, sorted([first.hash, second.hash]))
+ self.assertEqual(jobs[1].input_log_hashes, [third.hash])
+
+ def test_task_parse_logs_allows_queue_override_and_robots_source(self):
+ self._log_file("1" * 32, "2012-03-10")
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ tasks.task_parse_logs.run(
+ collections=["books"],
+ include_logs_with_error=False,
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ queue_name="parse_small_mult",
+ robots_source="counter",
+ )
+
+ mocked_apply_async.assert_called_once()
+ self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult")
+ self.assertEqual(mocked_apply_async.call_args.kwargs["args"][-1], "counter")
+
+ def test_task_parse_logs_skip_log_hashes_prevents_reprocessing_same_auto_run(self):
+ skipped = self._log_file("1" * 32, "2012-03-10", status=choices.LOG_FILE_STATUS_ERROR)
+ queued = self._log_file("2" * 32, "2012-03-11")
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ result = tasks.task_parse_logs.run(
+ collections=["books"],
+ include_logs_with_error=True,
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ skip_log_hashes=[skipped.hash],
+ )
+
+ mocked_apply_async.assert_called_once()
+ job = DailyMetricJob.objects.get()
+ self.assertEqual(job.input_log_hashes, [queued.hash])
+ self.assertEqual(result["enqueued_jobs"], 1)
+
+ def test_wait_parse_logs_wave_rechecks_until_daily_jobs_complete(self):
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTING,
+ )
+
+ with patch("metrics.tasks.task_wait_parse_logs_wave.apply_async") as mocked_wait_apply_async:
+ with patch("metrics.tasks.task_parse_logs.apply_async") as mocked_parse_logs_apply_async:
+ result = tasks.task_wait_parse_logs_wave.run(
+ wave_log_hashes=[job.pk],
+ collections=["books"],
+ include_logs_with_error=False,
+ max_log_files=2,
+ auto_reexecute=True,
+ )
+
+ self.assertEqual(result, {"wave_completed": False, "reexecution_enqueued": False})
+ mocked_parse_logs_apply_async.assert_not_called()
+ mocked_wait_apply_async.assert_called_once()
+
+
+class ResumeDailyMetricJobTests(TestCase):
+ def setUp(self):
+ self.collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ def test_resume_log_exports_requeues_error_daily_jobs(self):
+ log_file = LogFile.objects.create(
+ hash="1" * 32,
+ path="/tmp/1.log.gz",
+ stat_result={},
+ status=choices.LOG_FILE_STATUS_ERROR,
+ collection=self.collection,
+ date=date(2012, 3, 10),
+ )
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ input_log_hashes=[log_file.hash],
+ )
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ result = tasks.task_resume_log_exports.run(
+ collections=["books"],
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ queue_name="parse_small_mult",
+ )
+
+ mocked_apply_async.assert_called_once()
+ self.assertEqual(mocked_apply_async.call_args.kwargs["args"][0], job.pk)
+ self.assertEqual(mocked_apply_async.call_args.kwargs["queue"], "parse_small_mult")
+ self.assertEqual(result["resumed_logs"], 1)
+
+ def test_resume_log_exports_clears_payload_when_current_logs_change(self):
+ log_file = LogFile.objects.create(
+ hash="2" * 32,
+ path="/tmp/2.log.gz",
+ stat_result={},
+ status=choices.LOG_FILE_STATUS_QUEUED,
+ collection=self.collection,
+ date=date(2012, 3, 10),
+ )
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ input_log_hashes=["1" * 32],
+ storage_path="books/2012/03/2012-03-10.json",
+ payload_hash="abc",
+ summary={"month_document_count": 1},
+ )
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async"):
+ tasks.task_resume_log_exports.run(
+ collections=["books"],
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ )
+
+ job.refresh_from_db()
+ self.assertEqual(job.input_log_hashes, [log_file.hash])
+ self.assertEqual(job.storage_path, "")
+ self.assertEqual(job.payload_hash, "")
+ self.assertEqual(job.summary, {})
+
+ def test_resume_log_exports_preserves_payload_when_current_logs_match(self):
+ log_file = LogFile.objects.create(
+ hash="1" * 32,
+ path="/tmp/1.log.gz",
+ stat_result={},
+ status=choices.LOG_FILE_STATUS_ERROR,
+ collection=self.collection,
+ date=date(2012, 3, 10),
+ )
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ input_log_hashes=[log_file.hash],
+ storage_path="books/2012/03/2012-03-10.json",
+ payload_hash="abc",
+ summary={"month_document_count": 1},
+ )
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async"):
+ tasks.task_resume_log_exports.run(
+ collections=["books"],
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ )
+
+ job.refresh_from_db()
+ self.assertEqual(job.storage_path, "books/2012/03/2012-03-10.json")
+ self.assertEqual(job.payload_hash, "abc")
+ self.assertEqual(job.summary, {"month_document_count": 1})
+
+ def test_resume_log_exports_requeues_stored_payload_without_current_logs(self):
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ input_log_hashes=["1" * 32],
+ storage_path="books/2012/03/2012-03-10.json",
+ payload_hash="abc",
+ )
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ result = tasks.task_resume_log_exports.run(
+ collections=["books"],
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ )
+
+ mocked_apply_async.assert_called_once()
+ self.assertEqual(mocked_apply_async.call_args.kwargs["args"][0], job.pk)
+ self.assertEqual(result["resumed_jobs"], 1)
+
+ def test_resume_log_exports_skips_jobs_without_logs_or_payload(self):
+ DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_ERROR,
+ )
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ result = tasks.task_resume_log_exports.run(
+ collections=["books"],
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ )
+
+ mocked_apply_async.assert_not_called()
+ self.assertEqual(result["resumed_jobs"], 0)
+
+ def test_resume_log_exports_releases_stale_exporting_jobs(self):
+ log_file = LogFile.objects.create(
+ hash="1" * 32,
+ path="/tmp/1.log.gz",
+ stat_result={},
+ status=choices.LOG_FILE_STATUS_ERROR,
+ collection=self.collection,
+ date=date(2012, 3, 10),
+ )
+ job = DailyMetricJob.objects.create(
+ collection=self.collection,
+ access_date=date(2012, 3, 10),
+ status=DailyMetricJob.STATUS_EXPORTING,
+ input_log_hashes=[log_file.hash],
+ export_started_at=timezone.now() - timedelta(minutes=120),
+ )
+
+ with patch("metrics.tasks.task_process_daily_metric_job.apply_async") as mocked_apply_async:
+ result = tasks.task_resume_log_exports.run(
+ collections=["books"],
+ from_date="2012-03-01",
+ until_date="2012-03-31",
+ stale_after_minutes=60,
+ )
+
+ job.refresh_from_db()
+ self.assertEqual(job.status, DailyMetricJob.STATUS_PENDING)
+ mocked_apply_async.assert_called_once()
+ self.assertEqual(result["released_stale_batches"], 1)
diff --git a/metrics/utils/__init__.py b/metrics/utils/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/metrics/utils/index_utils.py b/metrics/utils/index_utils.py
deleted file mode 100644
index 76af8c2..0000000
--- a/metrics/utils/index_utils.py
+++ /dev/null
@@ -1,331 +0,0 @@
-from scielo_usage_counter.counter import compute_r5_metrics
-from scielo_usage_counter.values import CONTENT_TYPE_UNDEFINED, MEDIA_FORMAT_UNDEFINED
-
-from core.utils import standardizer
-from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour
-
-
-def generate_user_session_id(client_name, client_version, ip_address, datetime, sep='|'):
- """
- Generates a user session ID based on the provided parameters.
-
- Parameters:
- client_name (str): The name of the client.
- client_version (str): The version of the client.
- ip_address (str): The IP address of the user.
- datetime (datetime): The datetime object representing the session time.
- sep (str): The separator to use in the ID. Default is '|'.
-
- Returns:
- str: A user session ID formatted as a string.
- """
- dt_year_month_day = datetime.strftime('%Y-%m-%d')
- dt_hour = datetime.strftime('%H')
-
- return sep.join([
- str(client_name),
- str(client_version),
- str(ip_address),
- str(dt_year_month_day),
- str(dt_hour),
- ])
-
-
-def generate_item_access_id(col_acron3, scielo_issn, pid_v2, pid_v3, pid_generic, user_session_id, country_code, media_language, media_format, content_type, sep='|'):
- """
- Generates an item access ID based on the provided parameters.
-
- Parameters:
- col_acron3 (str): The acronym of the collection.
- scielo_issn (str): The ISSN of the SciELO journal.
- pid_v2 (str): The PID version 2.
- pid_v3 (str): The PID version 3.
- pid_generic (str): The generic PID.
- user_session_id (str): The user session ID.
- country_code (str): The country code of the user.
- media_language (str): The language of the media.
- media_format (str): The format of the media.
- content_type (str): The type of content.
- sep (str): The separator to use in the ID. Default is '|'.
- """
- return sep.join([
- col_acron3,
- scielo_issn,
- pid_v2 or '',
- pid_v3 or '',
- pid_generic or '',
- user_session_id,
- country_code,
- media_language,
- media_format,
- content_type,
- ])
-
-
-def generate_index_name(index_prefix: str, collection: str, date: str):
- """ Generates an index name based on the provided parameters.
- Parameters:
- index_prefix (str): The prefix for the index name.
- collection (str): The collection acronym.
- date (str): The date string in 'YYYY-MM-DD' format.
- Returns:
- str: The formatted index name.
- """
- if not date or not isinstance(date, str):
- raise ValueError("Date must be a non-empty string in 'YYYY-MM-DD' format.")
-
- if not collection or not isinstance(collection, str):
- raise ValueError("Collection must be a non-empty string.")
-
- if not index_prefix or not isinstance(index_prefix, str):
- raise ValueError("Index prefix must be a non-empty string.")
-
- index_year, _, _ = date.split('-')
- return f'{index_prefix}_{collection}_{index_year}'
-
-
-def generate_index_id(collection, journal, pid_v2, pid_v3, pid_generic, media_language, country_code, date_str):
- """
- Generates a unique index key based on the provided parameters.
- This is different from the item access ID as it does not include user session, media_format, and content_type information.
- It is used for indexing purposes.
-
- Parameters:
- collection (str): The collection acronym.
- journal (str): The journal name.
- pid_v2 (str): The PID version 2.
- pid_v3 (str): The PID version 3.
- pid_generic (str): The generic PID.
- media_language (str): The media language code.
- country_code (str): The country code.
- date_str (str): The date string in 'YYYY-MM-DD' format.
-
- Returns:
- str: A unique index key formatted as a string.
- """
- return '|'.join([
- collection,
- journal,
- pid_v2 or '',
- pid_v3 or '',
- pid_generic or '',
- media_language,
- country_code,
- date_str
- ])
-
-
-def extract_item_access_data(collection_acron3:str, translated_url: dict):
- """
- Extracts item access data from the translated URL and standardizes it.
-
- Args:
- collection_acron3 (str): The acronym of the collection.
- translated_url (dict): The translated URL containing metadata.
-
- Returns:
- dict: A dictionary containing standardized item access data, or None if the data is invalid.
- """
- if not translated_url or not isinstance(translated_url, dict):
- return {}
-
- item_access_data = {
- 'collection': collection_acron3,
- 'scielo_issn': translated_url.get('scielo_issn'),
- 'pid_v2': standardizer.standardize_pid_v2(translated_url.get('pid_v2')),
- 'pid_v3': standardizer.standardize_pid_v3(translated_url.get('pid_v3')),
- 'pid_generic': standardizer.standardize_pid_generic(translated_url.get('pid_generic')),
- 'media_language': standardizer.standardize_language_code(translated_url.get('media_language')),
- 'media_format': translated_url.get('media_format'),
- 'content_type': translated_url.get('content_type'),
- 'year_of_publication': standardizer.standardize_year_of_publication(translated_url.get('year_of_publication')),
- 'journal_main_title': translated_url.get('journal_main_title'),
- 'journal_subject_area_capes': translated_url.get('journal_subject_area_capes'),
- 'journal_subject_area_wos': translated_url.get('journal_subject_area_wos'),
- 'journal_acronym': translated_url.get('journal_acronym'),
- 'journal_publisher_name': translated_url.get('journal_publisher_name'),
- }
-
- return item_access_data
-
-
-def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False):
- """
- Validates the item access data based on the provided parameters.
-
- Parameters:
- data (dict): A dictionary containing the following keys:
- - scielo_issn (str): The ISSN of the SciELO journal.
- - pid_v2 (str): The PID version 2 of the document.
- - pid_v3 (str): The PID version 3 of the document.
- - media_format (str): The media format of the document.
- - content_type (str): The content type of the document.
- utm: URL translation manager for converting URLs
- ignore_utm_validation (bool): If True, skips validation against the URL translation manager.
-
- Returns:
- tuple: A tuple containing a boolean indicating whether the data is valid and a message.
- If the data is valid, the first element is True and the second element is a success message.
- If the data is invalid, the first element is False and the second element is an error message.
- """
- if not isinstance(data, dict):
- return False, {'message': 'Invalid data format. Expected a dictionary.', 'code': 'invalid_format'}
-
- scielo_issn = data.get('scielo_issn')
- media_format = data.get('media_format')
- content_type = data.get('content_type')
- pid_v2 = data.get('pid_v2')
- pid_v3 = data.get('pid_v3')
- pid_generic = data.get('pid_generic')
-
- if not all([
- scielo_issn,
- media_format and media_format != MEDIA_FORMAT_UNDEFINED,
- content_type and content_type != CONTENT_TYPE_UNDEFINED,
- pid_v2 or pid_v3 or pid_generic,
- ]):
- return False, {'message': 'Missing required fields in item access data.', 'code': 'missing_fields'}
-
- # Check ISSN and PIDs validity using the URL translation manager
- if utm and not ignore_utm_validation:
- if not utm.is_valid_code(scielo_issn, utm.journals_metadata['issn_set']):
- return False, {'message': f'Invalid scielo_issn: {scielo_issn}', 'code': 'invalid_scielo_issn'}
-
- if pid_v2 and not utm.is_valid_code(pid_v2, utm.articles_metadata['pid_set']):
- return False, {'message': f'Invalid pid_v2: {pid_v2}', 'code': 'invalid_pid_v2'}
-
- if pid_v3 and not utm.is_valid_code(pid_v3, utm.articles_metadata['pid_set']):
- return False, {'message': f'Invalid pid_v3: {pid_v3}', 'code': 'invalid_pid_v3'}
-
- if pid_generic and not utm.is_valid_code(pid_generic, utm.articles_metadata['pid_set']):
- return False, {'message': f'Invalid pid_generic: {pid_generic}', 'code': 'invalid_pid_generic'}
-
- return True, {'message': 'Item access data is valid.', 'code': 'valid'}
-
-
-def update_results_with_item_access_data(results: dict, item_access_data: dict, line: dict):
- """
- Updates the item access data with the information from the log line.
-
- Args:
- data (dict): The dictionary to store item access data.
- item_access_data (dict): The item access data extracted from the translated URL.
- line (dict): The log line containing additional information.
-
- Returns:
- None.
- """
- col_acron3 = item_access_data.get('collection')
- scielo_issn = item_access_data.get('scielo_issn')
- pid_v2 = item_access_data.get('pid_v2')
- pid_v3 = item_access_data.get('pid_v3')
- pid_generic = item_access_data.get('pid_generic')
-
- media_format = item_access_data.get('media_format')
- media_language = item_access_data.get('media_language')
- content_type = item_access_data.get('content_type')
-
- client_name = line.get('client_name')
- client_version = line.get('client_version')
- local_datetime = line.get('local_datetime')
- country_code = line.get('country_code')
- ip_address = line.get('ip_address')
-
- truncated_datetime = truncate_datetime_to_hour(local_datetime)
- ms_key = extract_minute_second_key(local_datetime)
-
- user_session_id = generate_user_session_id(
- client_name,
- client_version,
- ip_address,
- truncated_datetime,
- )
-
- item_access_id = generate_item_access_id(
- user_session_id=user_session_id,
- col_acron3=col_acron3,
- scielo_issn=scielo_issn,
- pid_v2=pid_v2,
- pid_v3=pid_v3,
- pid_generic=pid_generic,
- media_language=media_language,
- country_code=country_code,
- media_format=media_format,
- content_type=content_type,
- )
-
- if item_access_id not in results:
- results[item_access_id] = {
- 'click_timestamps': {ms_key: 0},
- 'media_format': media_format,
- 'media_language': media_language,
- 'content_type': content_type,
- 'country_code': country_code,
- 'date_str': truncated_datetime.strftime('%Y-%m-%d'),
- 'date': truncated_datetime,
- 'year_of_publication': item_access_data.get('year_of_publication'),
- 'journal': {
- 'scielo_issn': item_access_data.get('scielo_issn'),
- 'main_title': item_access_data.get('journal_main_title'),
- 'subject_area_capes': item_access_data.get('journal_subject_area_capes'),
- 'subject_area_wos': item_access_data.get('journal_subject_area_wos'),
- 'acronym': item_access_data.get('journal_acronym'),
- 'publisher_name': item_access_data.get('journal_publisher_name'),
- },
- }
-
- # Check if the click timestamp for this minute-second key exists, if not, initialize it
- if ms_key not in results[item_access_id]['click_timestamps']:
- results[item_access_id]['click_timestamps'][ms_key] = 0
-
- # Increment the click timestamp count
- results[item_access_id]['click_timestamps'][ms_key] += 1
-
-
-def convert_to_index_documents(data: dict, key_sep='|'):
- """
- Converts the provided data into a format suitable for indexing metrics.
- This function processes the data dictionary, extracting relevant fields and computing metrics.
-
- Args:
- data (dict): A dictionary containing the metrics data to be processed.
-
- Returns:
- dict: A dictionary containing the processed metrics data, ready for indexing.
- """
- if not isinstance(data, dict):
- return {}
-
- metrics_data = {}
-
- for key, value in data.items():
- collection, scielo_issn, pid_v2, pid_v3, pid_generic, _, _, _, _, _, country_code, media_language, _, content_type = key.split(key_sep)
-
- document_id = generate_index_id(
- collection,
- scielo_issn,
- pid_v2,
- pid_v3,
- pid_generic,
- media_language,
- country_code,
- value.get('date_str')
- )
-
- compute_r5_metrics(
- document_id,
- metrics_data,
- collection,
- value.get('journal'),
- pid_v2,
- pid_v3,
- pid_generic,
- value.get('year_of_publication'),
- media_language,
- value.get('country_code'),
- value.get('date_str'),
- value.get('click_timestamps'),
- content_type,
- )
-
- return metrics_data
diff --git a/metrics/wagtail_hooks.py b/metrics/wagtail_hooks.py
new file mode 100644
index 0000000..94c2ffb
--- /dev/null
+++ b/metrics/wagtail_hooks.py
@@ -0,0 +1,22 @@
+from django.utils.translation import gettext_lazy as _
+from wagtail.snippets.views.snippets import SnippetViewSet
+
+from metrics.models import DailyMetricJob
+
+class DailyMetricJobSnippetViewSet(SnippetViewSet):
+ model = DailyMetricJob
+ menu_label = _("Daily Metric Jobs")
+ icon = "history"
+ menu_order = 600
+ list_display = (
+ "collection",
+ "access_date",
+ "status",
+ "input_log_count",
+ "attempts",
+ "export_started_at",
+ "exported_at",
+ "updated",
+ )
+ list_filter = ("status", "collection", "access_date")
+ search_fields = ("collection__acron3", "error_message")
diff --git a/article/management/__init__.py b/reports/__init__.py
similarity index 100%
rename from article/management/__init__.py
rename to reports/__init__.py
diff --git a/reports/apps.py b/reports/apps.py
new file mode 100644
index 0000000..119ca26
--- /dev/null
+++ b/reports/apps.py
@@ -0,0 +1,8 @@
+from django.apps import AppConfig
+from django.utils.translation import gettext_lazy as _
+
+
+class ReportsConfig(AppConfig):
+ default_auto_field = "django.db.models.BigAutoField"
+ name = "reports"
+ verbose_name = _("Reports")
diff --git a/reports/migrations/0001_initial.py b/reports/migrations/0001_initial.py
new file mode 100644
index 0000000..2a72923
--- /dev/null
+++ b/reports/migrations/0001_initial.py
@@ -0,0 +1,140 @@
+# Generated by Django 5.2.12 on 2026-05-01 15:50
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ initial = True
+
+ dependencies = [
+ ("collection", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="MonthlyLogReport",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("total_files", models.IntegerField(default=0)),
+ ("created_files", models.IntegerField(default=0)),
+ ("validated_files", models.IntegerField(default=0)),
+ ("invalidated_files", models.IntegerField(default=0)),
+ ("errored_files", models.IntegerField(default=0)),
+ ("lines_parsed", models.IntegerField(default=0)),
+ ("valid_lines", models.IntegerField(default=0)),
+ ("discarded_lines", models.IntegerField(default=0)),
+ ("ip_local_count", models.IntegerField(default=0)),
+ ("ip_remote_count", models.IntegerField(default=0)),
+ ("ip_unknown_count", models.IntegerField(default=0)),
+ ("generated_at", models.DateTimeField(auto_now=True)),
+ ("year", models.IntegerField(verbose_name="Year")),
+ ("month", models.IntegerField(verbose_name="Month")),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Monthly Log Report",
+ "verbose_name_plural": "Monthly Log Reports",
+ "ordering": ["-year", "-month", "collection__acron3"],
+ "unique_together": {("collection", "year", "month")},
+ },
+ ),
+ migrations.CreateModel(
+ name="WeeklyLogReport",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("total_files", models.IntegerField(default=0)),
+ ("created_files", models.IntegerField(default=0)),
+ ("validated_files", models.IntegerField(default=0)),
+ ("invalidated_files", models.IntegerField(default=0)),
+ ("errored_files", models.IntegerField(default=0)),
+ ("lines_parsed", models.IntegerField(default=0)),
+ ("valid_lines", models.IntegerField(default=0)),
+ ("discarded_lines", models.IntegerField(default=0)),
+ ("ip_local_count", models.IntegerField(default=0)),
+ ("ip_remote_count", models.IntegerField(default=0)),
+ ("ip_unknown_count", models.IntegerField(default=0)),
+ ("generated_at", models.DateTimeField(auto_now=True)),
+ ("year", models.IntegerField(verbose_name="Year")),
+ ("week", models.IntegerField(verbose_name="ISO Week")),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Weekly Log Report",
+ "verbose_name_plural": "Weekly Log Reports",
+ "ordering": ["-year", "-week", "collection__acron3"],
+ "unique_together": {("collection", "year", "week")},
+ },
+ ),
+ migrations.CreateModel(
+ name="YearlyLogReport",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("total_files", models.IntegerField(default=0)),
+ ("created_files", models.IntegerField(default=0)),
+ ("validated_files", models.IntegerField(default=0)),
+ ("invalidated_files", models.IntegerField(default=0)),
+ ("errored_files", models.IntegerField(default=0)),
+ ("lines_parsed", models.IntegerField(default=0)),
+ ("valid_lines", models.IntegerField(default=0)),
+ ("discarded_lines", models.IntegerField(default=0)),
+ ("ip_local_count", models.IntegerField(default=0)),
+ ("ip_remote_count", models.IntegerField(default=0)),
+ ("ip_unknown_count", models.IntegerField(default=0)),
+ ("generated_at", models.DateTimeField(auto_now=True)),
+ ("year", models.IntegerField(verbose_name="Year")),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Yearly Log Report",
+ "verbose_name_plural": "Yearly Log Reports",
+ "ordering": ["-year", "collection__acron3"],
+ "unique_together": {("collection", "year")},
+ },
+ ),
+ ]
diff --git a/reports/migrations/0002_alter_monthlylogreport_options_and_more.py b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py
new file mode 100644
index 0000000..659215c
--- /dev/null
+++ b/reports/migrations/0002_alter_monthlylogreport_options_and_more.py
@@ -0,0 +1,36 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("reports", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.AlterModelOptions(
+ name="monthlylogreport",
+ options={
+ "ordering": ["collection__acron3", "year", "month"],
+ "verbose_name": "Monthly Log Report",
+ "verbose_name_plural": "Monthly Log Reports",
+ },
+ ),
+ migrations.AlterModelOptions(
+ name="weeklylogreport",
+ options={
+ "ordering": ["collection__acron3", "year", "week"],
+ "verbose_name": "Weekly Log Report",
+ "verbose_name_plural": "Weekly Log Reports",
+ },
+ ),
+ migrations.AlterModelOptions(
+ name="yearlylogreport",
+ options={
+ "ordering": ["collection__acron3", "year"],
+ "verbose_name": "Yearly Log Report",
+ "verbose_name_plural": "Yearly Log Reports",
+ },
+ ),
+ ]
diff --git a/article/management/commands/__init__.py b/reports/migrations/__init__.py
similarity index 100%
rename from article/management/commands/__init__.py
rename to reports/migrations/__init__.py
diff --git a/reports/models.py b/reports/models.py
new file mode 100644
index 0000000..3af1ec8
--- /dev/null
+++ b/reports/models.py
@@ -0,0 +1,100 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+
+
+class AbstractLogReport(models.Model):
+ collection = models.ForeignKey(
+ Collection,
+ on_delete=models.CASCADE,
+ verbose_name=_("Collection"),
+ )
+ total_files = models.IntegerField(default=0)
+ created_files = models.IntegerField(default=0)
+ validated_files = models.IntegerField(default=0)
+ invalidated_files = models.IntegerField(default=0)
+ errored_files = models.IntegerField(default=0)
+ lines_parsed = models.IntegerField(default=0)
+ valid_lines = models.IntegerField(default=0)
+ discarded_lines = models.IntegerField(default=0)
+ ip_local_count = models.IntegerField(default=0)
+ ip_remote_count = models.IntegerField(default=0)
+ ip_unknown_count = models.IntegerField(default=0)
+ generated_at = models.DateTimeField(auto_now=True)
+
+ class Meta:
+ abstract = True
+
+ @property
+ def pct_validated(self):
+ if not self.total_files:
+ return 0
+ return round(self.validated_files / self.total_files * 100, 1)
+ pct_validated.fget.short_description = _("% Valid Files")
+
+ @property
+ def pct_valid_lines(self):
+ if not self.lines_parsed:
+ return 0
+ return round(self.valid_lines / self.lines_parsed * 100, 1)
+ pct_valid_lines.fget.short_description = _("% Valid Lines")
+
+ @property
+ def pct_remote_ip(self):
+ total = self.ip_remote_count + self.ip_local_count
+ if not total:
+ return 0
+ return round(self.ip_remote_count / total * 100, 1)
+ pct_remote_ip.fget.short_description = _("% Remote IP")
+
+ def __str__(self):
+ return f"{self.collection.acron3} {self.period_label}"
+
+ @property
+ def period_label(self):
+ raise NotImplementedError
+
+
+class WeeklyLogReport(AbstractLogReport):
+ year = models.IntegerField(verbose_name=_("Year"))
+ week = models.IntegerField(verbose_name=_("ISO Week"))
+
+ class Meta:
+ unique_together = [("collection", "year", "week")]
+ ordering = ["collection__acron3", "year", "week"]
+ verbose_name = _("Weekly Log Report")
+ verbose_name_plural = _("Weekly Log Reports")
+
+ @property
+ def period_label(self):
+ return f"{self.year}-W{self.week:02d}"
+
+
+class MonthlyLogReport(AbstractLogReport):
+ year = models.IntegerField(verbose_name=_("Year"))
+ month = models.IntegerField(verbose_name=_("Month"))
+
+ class Meta:
+ unique_together = [("collection", "year", "month")]
+ ordering = ["collection__acron3", "year", "month"]
+ verbose_name = _("Monthly Log Report")
+ verbose_name_plural = _("Monthly Log Reports")
+
+ @property
+ def period_label(self):
+ return f"{self.year}-{self.month:02d}"
+
+
+class YearlyLogReport(AbstractLogReport):
+ year = models.IntegerField(verbose_name=_("Year"))
+
+ class Meta:
+ unique_together = [("collection", "year")]
+ ordering = ["collection__acron3", "year"]
+ verbose_name = _("Yearly Log Report")
+ verbose_name_plural = _("Yearly Log Reports")
+
+ @property
+ def period_label(self):
+ return str(self.year)
diff --git a/reports/tasks.py b/reports/tasks.py
new file mode 100644
index 0000000..69a53a1
--- /dev/null
+++ b/reports/tasks.py
@@ -0,0 +1,238 @@
+import logging
+import re
+from collections import defaultdict
+
+from django.core.mail import send_mail
+from django.conf import settings
+from django.utils.translation import gettext as _
+
+from config import celery_app
+from core.utils import date_utils
+from collection.models import Collection
+from log_manager import choices
+from log_manager.models import LogFile
+from log_manager_config import models as lmc_models
+
+from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport
+
+
+def _extract_date_from_log_file(lf):
+ if lf.date:
+ return lf.date
+
+ probably_date = (lf.validation or {}).get("probably_date")
+ if isinstance(probably_date, str) and probably_date:
+ return date_utils.get_date_obj(probably_date)
+
+    try:
+ match = re.search(r"(\d{4}-\d{2}-\d{2})", lf.path)
+ if match:
+ return date_utils.get_date_obj(match.group(1))
+ except Exception:
+ pass
+
+ return None
+
+
+@celery_app.task(bind=True, name=_("[Reports] Populate All Reports"))
+def task_populate_all_reports(self, year=None, collection_acron=None):
+ qs = LogFile.objects.select_related("collection")
+ if collection_acron:
+ qs = qs.filter(collection__acron3=collection_acron)
+ qs = qs.only(
+ "id", "collection_id", "date", "path", "status", "summary", "validation"
+ )
+
+ weekly = defaultdict(lambda: defaultdict(int))
+ monthly = defaultdict(lambda: defaultdict(int))
+ yearly = defaultdict(lambda: defaultdict(int))
+
+ for lf in qs.iterator(chunk_size=2000):
+ extracted_date = _extract_date_from_log_file(lf)
+ if not extracted_date:
+ continue
+ if year and extracted_date.year != int(year):
+ continue
+
+        iso_year, iso_week = extracted_date.isocalendar()[:2]
+ yr = extracted_date.year
+ mo = extracted_date.month
+
+ for agg, key in [
+ (weekly, (lf.collection_id, iso_year, iso_week)),
+ (monthly, (lf.collection_id, yr, mo)),
+ (yearly, (lf.collection_id, yr)),
+ ]:
+ r = agg[key]
+ r["total_files"] += 1
+ st = lf.status
+ if st == "CRE":
+ r["created_files"] += 1
+ elif st in ("QUE", "PAR", "PRO"):
+ r["validated_files"] += 1
+ elif st == "INV":
+ r["invalidated_files"] += 1
+ elif st == "ERR":
+ r["errored_files"] += 1
+
+ s = lf.summary or {}
+ lp = s.get("lines_parsed", 0) or 0
+ vl = s.get("valid_lines", 0) or 0
+ r["lines_parsed"] += lp
+ r["valid_lines"] += vl
+ r["discarded_lines"] += max(lp - vl, 0)
+
+ ips = (
+ (lf.validation or {})
+ .get("content", {})
+ .get("summary", {})
+ .get("ips", {})
+ )
+ r["ip_local_count"] += ips.get("local", 0) or 0
+ r["ip_remote_count"] += ips.get("remote", 0) or 0
+ r["ip_unknown_count"] += ips.get("unknown", 0) or 0
+
+ w_count = _upsert_reports(WeeklyLogReport, weekly)
+ m_count = _upsert_reports(MonthlyLogReport, monthly)
+ y_count = _upsert_reports(YearlyLogReport, yearly)
+
+ logging.info(
+ "Reports populated: %s weekly, %s monthly, %s yearly.",
+ w_count, m_count, y_count,
+ )
+ return f"Weekly: {w_count}, Monthly: {m_count}, Yearly: {y_count}"
+
+
+def _upsert_reports(model_class, data):
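+ # `data` maps (collection_id, <period values>) keys to field-count dicts; the
+ # period field names are taken from the model's unique_together definition.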
+ count = 0
+ unique_fields = list(model_class._meta.unique_together[0])
+ period_fields = unique_fields[1:]
+ for key, fields in data.items():
+ coll_id = key[0]
+ period_values = key[1:]
+ lookup = {"collection_id": coll_id}
+ for idx, field_name in enumerate(period_fields):
+ lookup[field_name] = period_values[idx]
+ model_class.objects.update_or_create(defaults=fields, **lookup)
+ count += 1
+ return count
+
+
+@celery_app.task(
+ bind=True,
+ name=_("[Reports] Generate Log Report Summary (Manual)"),
+ queue="load",
+)
+def task_log_files_count_status_report(
+ self,
+ collections=None,
+ from_date=None,
+ until_date=None,
+ days_to_go_back=None,
+ user_id=None,
+ username=None,
+):
+ from_date_str, until_date_str = date_utils.get_date_range_str(
+ from_date, until_date, days_to_go_back
+ )
+ subject = _(
+ "Usage Log Report Summary ({from_date} to {until_date})"
+ ).format(from_date=from_date_str, until_date=until_date_str)
+
+ for collection_acron in (collections or Collection.acron3_list()):
+ try:
+ collection = Collection.objects.get(acron3=collection_acron)
+ except Collection.DoesNotExist:
+ logging.warning("Collection not found: %s", collection_acron)
+ continue
+
+ message = _build_report_message(
+ collection,
+ from_date_str,
+ until_date_str,
+ )
+
+ if not message:
+ continue
+
+ logging.info(
+ "Sending email to collection %s. Subject: %s.",
+ collection.main_name, subject,
+ )
+
+ _send_collection_email(subject, message, collection_acron)
+
+
+def _build_report_message(collection, from_date_str, until_date_str):
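+ # Build a plain-text summary of the latest monthly report and, when available,
+ # a comparison with the previous month.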
+ monthly = MonthlyLogReport.objects.filter(
+ collection=collection,
+ ).order_by("-year", "-month")
+
+ if not monthly.exists():
+ return ""
+
+ latest = monthly.first()
+ message = _(
+ "Usage Log Report for {acron}\nPeriod: {from_date} to {until_date}\n\n"
+ ).format(
+ acron=collection.acron3,
+ from_date=from_date_str,
+ until_date=until_date_str,
+ )
+ message += _("Latest month ({latest}):\n").format(latest=latest.period_label)
+ message += (
+ f" Total files: {latest.total_files}\n"
+ f" Validated files: {latest.validated_files} ({latest.pct_validated}%)\n"
+ f" Invalidated files: {latest.invalidated_files}\n"
+ f" Errored files: {latest.errored_files}\n"
+ f" Lines parsed: {latest.lines_parsed}\n"
+ f" Valid lines: {latest.valid_lines} ({latest.pct_valid_lines}%)\n"
+ f" Discarded lines: {latest.discarded_lines}\n"
+ f" Remote IPs: {latest.ip_remote_count} ({latest.pct_remote_ip}%)\n"
+ f" Local IPs: {latest.ip_local_count}\n"
+ )
+
+ prev_month = latest
+ if len(monthly) > 1:
+ prev_month = monthly[1]
+ message += _("\nPrevious month ({prev}):\n").format(prev=prev_month.period_label)
+ message += (
+ f" Total files: {prev_month.total_files}\n"
+ f" Validated files: {prev_month.validated_files} ({prev_month.pct_validated}%)\n"
+ f" Valid lines: {prev_month.valid_lines} ({prev_month.pct_valid_lines}%)\n"
+ f" Remote IPs: {prev_month.ip_remote_count} ({prev_month.pct_remote_ip}%)\n"
+ )
+
+ if prev_month.total_files:
+ file_diff = latest.total_files - prev_month.total_files
+ line_diff = latest.lines_parsed - prev_month.lines_parsed
+ message += _("\nMonth-over-month change:\n")
+ message += f" Files: {file_diff:+d}\n"
+ message += f" Lines: {line_diff:+d}\n"
+
+ message += (
+ "\n---\n"
+ "This report is automatically generated by SciELO Usage.\n"
+ )
+ return message
+
+
+def _send_collection_email(subject, message, collection):
+ emails = lmc_models.CollectionEmail.objects.filter(
+ config__collection__acron3=collection, active=True
+ ).values_list("email", flat=True)
+
+ if not emails:
+ logging.error(
+ "No active e-mail configuration found for collection %s. Please add one.",
+ collection,
+ )
+ return
+
+ try:
+ send_mail(
+ subject=subject,
+ message=message,
+ from_email=settings.DEFAULT_FROM_EMAIL,
+ recipient_list=list(emails),
+ )
+ except Exception as e:
+ logging.error("Error sending log files report for %s: %s", collection, e)
diff --git a/reports/wagtail_hooks.py b/reports/wagtail_hooks.py
new file mode 100644
index 0000000..b2aeac7
--- /dev/null
+++ b/reports/wagtail_hooks.py
@@ -0,0 +1,75 @@
+from django.contrib.auth import get_user_model
+from django.utils.translation import gettext_lazy as _
+from wagtail.snippets.views.snippets import SnippetViewSet, SnippetViewSetGroup
+from wagtail.snippets.models import register_snippet
+from wagtail.permission_policies.base import BasePermissionPolicy
+
+from reports.models import WeeklyLogReport, MonthlyLogReport, YearlyLogReport
+
+
+class ReadOnlyPermissionPolicy(BasePermissionPolicy):
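+ # Any active user may view the reports; add, change, and delete are disabled
+ # because the report rows are maintained by the Celery tasks in reports/tasks.py.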
+ def user_has_permission(self, user, action):
+ if action in ("add", "change", "delete"):
+ return False
+ return True
+
+ def users_with_any_permission(self, actions):
+ return get_user_model().objects.filter(is_active=True)
+
+
+COMMON_LIST_DISPLAY = (
+ "total_files",
+ "pct_validated",
+ "lines_parsed",
+ "pct_valid_lines",
+ "pct_remote_ip",
+ "generated_at",
+)
+
+
+class WeeklyLogReportSnippetViewSet(SnippetViewSet):
+ model = WeeklyLogReport
+ menu_label = _("Weekly")
+ icon = "info-circle"
+ menu_order = 100
+ list_display = ("collection", "year", "week") + COMMON_LIST_DISPLAY
+ list_filter = ("collection", "year", "week")
+ search_fields = ("collection__acron3",)
+ permission_policy = ReadOnlyPermissionPolicy(WeeklyLogReport)
+
+
+class MonthlyLogReportSnippetViewSet(SnippetViewSet):
+ model = MonthlyLogReport
+ menu_label = _("Monthly")
+ icon = "info-circle"
+ menu_order = 200
+ list_display = ("collection", "year", "month") + COMMON_LIST_DISPLAY
+ list_filter = ("collection", "year", "month")
+ search_fields = ("collection__acron3",)
+ permission_policy = ReadOnlyPermissionPolicy(MonthlyLogReport)
+
+
+class YearlyLogReportSnippetViewSet(SnippetViewSet):
+ model = YearlyLogReport
+ menu_label = _("Yearly")
+ icon = "info-circle"
+ menu_order = 300
+ list_display = ("collection", "year") + COMMON_LIST_DISPLAY
+ list_filter = ("collection", "year")
+ search_fields = ("collection__acron3",)
+ permission_policy = ReadOnlyPermissionPolicy(YearlyLogReport)
+
+
+class ReportsSnippetViewSetGroup(SnippetViewSetGroup):
+ menu_name = "usage_reports"
+ menu_label = _("Reports")
+ menu_icon = "info-circle"
+ menu_order = 350
+ items = (
+ WeeklyLogReportSnippetViewSet,
+ MonthlyLogReportSnippetViewSet,
+ YearlyLogReportSnippetViewSet,
+ )
+
+
+register_snippet(ReportsSnippetViewSetGroup)
diff --git a/requirements/base.txt b/requirements/base.txt
index 6ef5fba..7b5ed61 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -63,10 +63,13 @@ minio==7.2.7
reverse-geocode==1.6 # https://pypi.org/project/reverse-geocode/
# SciELO Log Validator
--e git+https://github.com/scieloorg/scielo_log_validator@0.4.0#egg=scielo_log_validator
+-e git+https://github.com/scieloorg/scielo_log_validator@2.0.0#egg=scielo_log_validator
+
+# SciELO Scholarly Data
+-e git+https://github.com/scieloorg/scielo_scholarly_data@v0.1.4#egg=scielo_scholarly_data
# SciELO Usage COUNTER
--e git+https://github.com/scieloorg/scielo_usage_counter@1.5.1#egg=scielo_usage_counter
+-e git+https://github.com/scieloorg/scielo_usage_counter@2.0.0#egg=scielo_usage_counter
# Device Detector
device-detector==0.10 # https://github.com/thinkwelltwd/device_detector
@@ -93,6 +96,6 @@ tenacity==8.3.0 # https://pypi.org/project/tenacity/
# ------------------------------------------------------------------------------
articlemetaapi==1.26.7
-# ElasticSearch
+# OpenSearch
# ------------------------------------------------------------------------------
-elasticsearch==8.18.1 # https://elasticsearch-py.readthedocs.io/en/v8.18.1/
+opensearch-py==3.1.0
diff --git a/resources/constants.py b/resources/constants.py
index feba18d..2ce64da 100644
--- a/resources/constants.py
+++ b/resources/constants.py
@@ -1,2 +1,2 @@
DEFAULT_COUNTER_ROBOTS_URL = 'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json'
-DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2025-02.mmdb.gz'
+DEFAULT_MMDB_URL = 'https://download.db-ip.com/free/dbip-country-lite-2026-03.mmdb.gz'
diff --git a/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py
new file mode 100644
index 0000000..80bb0cc
--- /dev/null
+++ b/resources/migrations/0002_remove_mmdb_creator_remove_mmdb_updated_by_and_more.py
@@ -0,0 +1,61 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("resources", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="mmdb",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="mmdb",
+ name="updated_by",
+ ),
+ migrations.RemoveField(
+ model_name="robotuseragent",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="robotuseragent",
+ name="updated_by",
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="is_active",
+ field=models.BooleanField(
+ db_index=True, default=True, verbose_name="Active"
+ ),
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="source_counter",
+ field=models.BooleanField(
+ db_index=True, default=False, verbose_name="From Atmire/COUNTER"
+ ),
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="source_scielo",
+ field=models.BooleanField(
+ db_index=True, default=False, verbose_name="From SciELO"
+ ),
+ ),
+ migrations.AddField(
+ model_name="robotuseragent",
+ name="source_url",
+ field=models.URLField(
+ blank=True, max_length=255, null=True, verbose_name="Source URL"
+ ),
+ ),
+ migrations.AlterField(
+ model_name="robotuseragent",
+ name="last_changed",
+ field=models.DateField(blank=True, null=True, verbose_name="Last Changed"),
+ ),
+ ]
diff --git a/resources/models.py b/resources/models.py
index a30b8d3..22663e2 100644
--- a/resources/models.py
+++ b/resources/models.py
@@ -2,11 +2,26 @@
from django.db import models
from django.utils.translation import gettext_lazy as _
+from wagtail.admin.panels import FieldPanel
-from core.models import CommonControlField
+class RobotUserAgent(models.Model):
+ SOURCE_ALL = "all"
+ SOURCE_COUNTER = "counter"
+ SOURCE_SCIELO = "scielo"
+ SOURCE_CHOICES = [SOURCE_ALL, SOURCE_COUNTER, SOURCE_SCIELO]
+ panels = [
+ FieldPanel("pattern"),
+ FieldPanel("source_counter"),
+ FieldPanel("source_scielo"),
+ FieldPanel("is_active"),
+ FieldPanel("source_url"),
+ FieldPanel("last_changed"),
+ ]
+
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
+ updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True)
-class RobotUserAgent(CommonControlField):
pattern = models.CharField(
verbose_name=_('Pattern'),
max_length=255,
@@ -14,21 +29,77 @@ class RobotUserAgent(CommonControlField):
blank=False,
primary_key=True,
)
+ source_counter = models.BooleanField(
+ verbose_name=_("From Atmire/COUNTER"),
+ default=False,
+ db_index=True,
+ )
+ source_scielo = models.BooleanField(
+ verbose_name=_("From SciELO"),
+ default=False,
+ db_index=True,
+ )
+ is_active = models.BooleanField(
+ verbose_name=_("Active"),
+ default=True,
+ db_index=True,
+ )
+ source_url = models.URLField(
+ verbose_name=_("Source URL"),
+ max_length=255,
+ null=True,
+ blank=True,
+ )
last_changed = models.DateField(
verbose_name=_('Last Changed'),
- null=False,
- blank=False,
+ null=True,
+ blank=True,
)
@classmethod
def get_all_patterns(cls):
- return cls.objects.values_list('pattern', flat=True)
+ return cls.get_patterns(source=cls.SOURCE_ALL)
+
+ @classmethod
+ def normalize_source(cls, source=None):
+ normalized = (source or cls.SOURCE_ALL).lower()
+ if normalized not in cls.SOURCE_CHOICES:
+ raise ValueError(f"Unsupported robots source: {source}")
+ return normalized
+
+ @classmethod
+ def get_patterns(cls, source=None):
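+ # e.g. RobotUserAgent.get_patterns(source="counter") returns only the active
+ # patterns flagged as coming from the Atmire/COUNTER list.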
+ source = cls.normalize_source(source)
+ queryset = cls.objects.filter(is_active=True)
+
+ if source == cls.SOURCE_COUNTER:
+ queryset = queryset.filter(source_counter=True)
+ elif source == cls.SOURCE_SCIELO:
+ queryset = queryset.filter(source_scielo=True)
+
+ return queryset.values_list("pattern", flat=True)
+
+ @property
+ def source_labels(self):
+ labels = []
+ if self.source_counter:
+ labels.append("Atmire/COUNTER")
+ if self.source_scielo:
+ labels.append("SciELO")
+ return ", ".join(labels) or "-"
+
+ def save(self, *args, **kwargs):
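+ # Entries created without an explicit source are treated as SciELO-maintained.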
+ if not self.source_counter and not self.source_scielo:
+ self.source_scielo = True
+ super().save(*args, **kwargs)
def __str__(self):
return self.pattern
-class MMDB(CommonControlField):
+class MMDB(models.Model):
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
+ updated = models.DateTimeField(verbose_name=_("Last update date"), auto_now=True)
id = models.CharField(
verbose_name=_('ID (HASH)'),
max_length=64,
diff --git a/resources/tasks.py b/resources/tasks.py
index e67cea1..4df60a9 100644
--- a/resources/tasks.py
+++ b/resources/tasks.py
@@ -1,19 +1,13 @@
import logging
-from django.contrib.auth import get_user_model
-from django.utils import timezone
from django.utils.translation import gettext as _
from config import celery_app
-from core.utils.utils import _get_user
from . import constants, models, utils
-
-User = get_user_model()
-
-@celery_app.task(bind=True, name=_('Load robots data'))
-def task_load_robots(self, url_robots=None, user_id=None, username=None):
+@celery_app.task(bind=True, name=_('[Resources] Load Robots Data'))
+def task_load_robots(self, url_robots=None):
"""
Load robots from a given URL and save them to the database.
This function fetches robot data from a specified URL (or a default URL if none is provided),
@@ -32,8 +26,6 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None):
- Error if there is an issue downloading or saving the robots.
- Debug information for each robot saved.
"""
- user = _get_user(self.request, username=username, user_id=user_id)
-
if not url_robots:
url_robots = constants.DEFAULT_COUNTER_ROBOTS_URL
logging.warning(f'No robots URL provided. Using default: {url_robots}')
@@ -45,43 +37,63 @@ def task_load_robots(self, url_robots=None, user_id=None, username=None):
return False
cleaned_robots_data = utils.clean_robots_list(robots_data)
+ fetched_patterns = set()
try:
for r_str in cleaned_robots_data:
pattern = r_str.get('pattern')
last_changed = r_str.get('last_changed')
+ fetched_patterns.add(pattern)
- r_obj, created = models.RobotUserAgent.objects.get_or_create(pattern=pattern, last_changed=last_changed)
+ r_obj = models.RobotUserAgent.objects.filter(pattern=pattern).first()
+ created = r_obj is None
if created:
- r_obj.creator = user
-
- r_obj.updated = timezone.now()
- r_obj.updated_by = user
+ r_obj = models.RobotUserAgent(
+ pattern=pattern,
+ source_counter=True,
+ source_scielo=False,
+ )
+ r_obj.source_counter = True
+ r_obj.is_active = True
+ r_obj.source_url = url_robots
+ r_obj.last_changed = last_changed
r_obj.save()
logging.debug(f'Robot saved: {r_obj}')
+
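+ # COUNTER patterns missing from the latest fetch are detached from the COUNTER
+ # source; they stay active only if they are also flagged as SciELO patterns.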
+ stale_counter_patterns = models.RobotUserAgent.objects.filter(
+ source_counter=True
+ ).exclude(pattern__in=fetched_patterns)
+
+ for r_obj in stale_counter_patterns:
+ r_obj.source_counter = False
+ r_obj.source_url = None
+ r_obj.last_changed = None
+ if not r_obj.source_scielo:
+ r_obj.is_active = False
+ r_obj.save()
+ logging.debug(f'Robot deactivated or detached from COUNTER source: {r_obj}')
+
return True
except Exception as e:
logging.error(f'Error saving robots: {e}')
+ return False
-@celery_app.task(bind=True, name=_('Load geolocation and country data'))
-def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate=True):
+@celery_app.task(bind=True, name=_('[Resources] Load Geolocation Data'))
+def task_load_geoip(self, url_geoip=None, validate=True):
"""
Load GeoIP data from a specified URL, validate it, and save it to the database.
Args:
url_geoip (str, optional): The URL to download the GeoIP data from. Defaults to None.
- user_id (int, optional): The ID of the user performing the task. Defaults to None.
- username (str, optional): The username of the user performing the task. Defaults to None.
validate (bool, optional): Whether to validate the GeoIP data. Defaults to True.
Returns:
bool: True if the GeoIP data was successfully loaded and saved, False otherwise.
Raises:
Exception: If there is an error downloading, decompressing, or validating the GeoIP data.
"""
- user = _get_user(self.request, username=username, user_id=user_id)
if not url_geoip:
url_geoip = constants.DEFAULT_MMDB_URL
@@ -115,10 +127,6 @@ def task_load_geoip(self, url_geoip=None, user_id=None, username=None, validate=
except models.MMDB.DoesNotExist:
mmdb_obj = models.MMDB.objects.create(id=mmdb_hash, data=mmdb_data)
mmdb_obj.url = url_geoip or constants.DEFAULT_MMDB_URL
- mmdb_obj.creator = user
-
- mmdb_obj.updated = timezone.now()
- mmdb_obj.updated_by = user
mmdb_obj.save()
logging.debug(f'GeoIP data has been saved: {mmdb_obj}')
diff --git a/resources/tests.py b/resources/tests.py
deleted file mode 100644
index 7ce503c..0000000
--- a/resources/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase
-
-# Create your tests here.
diff --git a/resources/tests/__init__.py b/resources/tests/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/resources/tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/resources/tests/test_robots.py b/resources/tests/test_robots.py
new file mode 100644
index 0000000..4d6bf74
--- /dev/null
+++ b/resources/tests/test_robots.py
@@ -0,0 +1,113 @@
+from unittest.mock import patch
+
+from django.test import TestCase
+
+from resources import models, tasks
+
+
+class RobotUserAgentModelTests(TestCase):
+ def test_manual_robot_defaults_to_scielo_source(self):
+ robot = models.RobotUserAgent.objects.create(pattern="CustomBot")
+
+ robot.refresh_from_db()
+
+ self.assertFalse(robot.source_counter)
+ self.assertTrue(robot.source_scielo)
+ self.assertTrue(robot.is_active)
+ self.assertEqual(robot.source_labels, "SciELO")
+
+ def test_get_all_patterns_only_returns_active_patterns(self):
+ active = models.RobotUserAgent.objects.create(
+ pattern="ActiveBot",
+ source_scielo=True,
+ is_active=True,
+ )
+ models.RobotUserAgent.objects.create(
+ pattern="InactiveBot",
+ source_scielo=True,
+ is_active=False,
+ )
+
+ self.assertListEqual(list(models.RobotUserAgent.get_all_patterns()), [active.pattern])
+
+ def test_get_patterns_can_filter_by_source(self):
+ counter_only = models.RobotUserAgent.objects.create(
+ pattern="CounterOnlyBot",
+ source_counter=True,
+ source_scielo=False,
+ is_active=True,
+ )
+ shared = models.RobotUserAgent.objects.create(
+ pattern="SharedBot",
+ source_counter=True,
+ source_scielo=True,
+ is_active=True,
+ )
+ scielo_only = models.RobotUserAgent.objects.create(
+ pattern="ScieloOnlyBot",
+ source_counter=False,
+ source_scielo=True,
+ is_active=True,
+ )
+
+ self.assertCountEqual(
+ list(models.RobotUserAgent.get_patterns(source="counter")),
+ [counter_only.pattern, shared.pattern],
+ )
+ self.assertCountEqual(
+ list(models.RobotUserAgent.get_patterns(source="scielo")),
+ [shared.pattern, scielo_only.pattern],
+ )
+
+ def test_get_patterns_rejects_invalid_source(self):
+ with self.assertRaises(ValueError):
+ list(models.RobotUserAgent.get_patterns(source="invalid"))
+
+
+class LoadRobotsTaskTests(TestCase):
+
+ @patch("resources.tasks.utils.fetch_data")
+ def test_task_load_robots_marks_counter_source_and_deactivates_stale_counter_entries(
+ self,
+ mock_fetch_data,
+ ):
+ mock_fetch_data.return_value = [
+ {"pattern": "CounterBot", "last_changed": "2025-01-15"},
+ {"pattern": "SharedBot", "last_changed": "2025-01-20"},
+ ]
+
+ stale_counter = models.RobotUserAgent.objects.create(
+ pattern="OldCounterBot",
+ source_counter=True,
+ is_active=True,
+ last_changed="2024-12-01",
+ source_url="https://old.example.org/robots.json",
+ )
+ shared_bot = models.RobotUserAgent.objects.create(
+ pattern="SharedBot",
+ source_scielo=True,
+ is_active=True,
+ )
+
+ result = tasks.task_load_robots.run(
+ url_robots="https://counter.example.org/robots.json",
+ )
+
+ self.assertTrue(result)
+
+ counter_bot = models.RobotUserAgent.objects.get(pattern="CounterBot")
+ self.assertTrue(counter_bot.source_counter)
+ self.assertFalse(counter_bot.source_scielo)
+ self.assertTrue(counter_bot.is_active)
+ self.assertEqual(counter_bot.source_url, "https://counter.example.org/robots.json")
+
+ shared_bot.refresh_from_db()
+ self.assertTrue(shared_bot.source_counter)
+ self.assertTrue(shared_bot.source_scielo)
+ self.assertTrue(shared_bot.is_active)
+
+ stale_counter.refresh_from_db()
+ self.assertFalse(stale_counter.source_counter)
+ self.assertFalse(stale_counter.is_active)
+ self.assertIsNone(stale_counter.source_url)
+ self.assertIsNone(stale_counter.last_changed)
diff --git a/resources/wagtail_hooks.py b/resources/wagtail_hooks.py
index 758bb53..c347b22 100644
--- a/resources/wagtail_hooks.py
+++ b/resources/wagtail_hooks.py
@@ -15,13 +15,25 @@ class RobotUserAgentSnippetViewSet(SnippetViewSet):
list_display = (
"pattern",
+ "source_labels",
+ "is_active",
"last_changed",
)
search_fields = (
"pattern",
+ "source_url",
+ )
+ list_filter = (
+ "source_counter",
+ "source_scielo",
+ "is_active",
)
list_export = (
"pattern",
+ "source_counter",
+ "source_scielo",
+ "is_active",
+ "source_url",
"last_changed",
)
export_filename = "robots"
diff --git a/source/__init__.py b/source/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/source/__init__.py
@@ -0,0 +1 @@
+
diff --git a/article/apps.py b/source/apps.py
similarity index 63%
rename from article/apps.py
rename to source/apps.py
index 8c0e2c9..06d886d 100644
--- a/article/apps.py
+++ b/source/apps.py
@@ -1,6 +1,6 @@
from django.apps import AppConfig
-class ArticleConfig(AppConfig):
+class SourceConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
- name = "article"
+ name = "source"
diff --git a/source/migrations/0001_initial.py b/source/migrations/0001_initial.py
new file mode 100644
index 0000000..cc736e3
--- /dev/null
+++ b/source/migrations/0001_initial.py
@@ -0,0 +1,210 @@
+# Generated by Django 5.0.7 on 2026-03-15 00:00
+
+import django.db.models.deletion
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ initial = True
+
+ dependencies = [
+ ("collection", "0001_initial"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="Source",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(
+ auto_now_add=True,
+ verbose_name="Creation date",
+ ),
+ ),
+ (
+ "updated",
+ models.DateTimeField(
+ auto_now=True,
+ verbose_name="Last update date",
+ ),
+ ),
+ (
+ "source_type",
+ models.CharField(
+ choices=[
+ ("journal", "Journal"),
+ ("book", "Book"),
+ ("preprint_server", "Preprint Server"),
+ ("data_repository", "Data Repository"),
+ ("other", "Other"),
+ ],
+ db_index=True,
+ max_length=32,
+ verbose_name="Source Type",
+ ),
+ ),
+ (
+ "source_id",
+ models.CharField(
+ db_index=True,
+ max_length=255,
+ verbose_name="Source ID",
+ ),
+ ),
+ (
+ "scielo_issn",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=9,
+ null=True,
+ verbose_name="SciELO ISSN",
+ ),
+ ),
+ (
+ "acronym",
+ models.CharField(
+ blank=True,
+ default="",
+ max_length=64,
+ null=True,
+ verbose_name="Source Acronym",
+ ),
+ ),
+ (
+ "title",
+ models.CharField(
+ max_length=255,
+ verbose_name="Source Title",
+ ),
+ ),
+ (
+ "identifiers",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Identifiers",
+ ),
+ ),
+ (
+ "publisher_name",
+ models.JSONField(
+ blank=True,
+ default=list,
+ null=True,
+ verbose_name="Publisher Name",
+ ),
+ ),
+ (
+ "subject_areas",
+ models.JSONField(
+ default=list,
+ verbose_name="Subject Areas (CAPES)",
+ ),
+ ),
+ (
+ "wos_subject_areas",
+ models.JSONField(
+ default=list,
+ verbose_name="Subject Areas (WoS)",
+ ),
+ ),
+ (
+ "default_lang",
+ models.CharField(
+ blank=True,
+ max_length=8,
+ null=True,
+ verbose_name="Default Language",
+ ),
+ ),
+ (
+ "publication_date",
+ models.CharField(
+ blank=True,
+ max_length=32,
+ null=True,
+ verbose_name="Publication Date",
+ ),
+ ),
+ (
+ "publication_year",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ max_length=4,
+ null=True,
+ verbose_name="Publication Year",
+ ),
+ ),
+ (
+ "extra_data",
+ models.JSONField(
+ blank=True,
+ default=dict,
+ null=True,
+ verbose_name="Extra Data",
+ ),
+ ),
+ (
+ "collection",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="collection.collection",
+ verbose_name="Collection",
+ ),
+ ),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Source",
+ "verbose_name_plural": "Sources",
+ "unique_together": {("collection", "source_type", "source_id")},
+ "indexes": [
+ models.Index(
+ fields=["collection", "source_type"],
+ name="source_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="source_collection_issn_idx",
+ ),
+ ],
+ },
+ ),
+ ]
diff --git a/source/migrations/0002_source_access_type.py b/source/migrations/0002_source_access_type.py
new file mode 100644
index 0000000..e148c15
--- /dev/null
+++ b/source/migrations/0002_source_access_type.py
@@ -0,0 +1,25 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("source", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="source",
+ name="access_type",
+ field=models.CharField(
+ blank=True,
+ choices=[
+ ("open_access", "Open Access"),
+ ("commercial", "Commercial"),
+ ],
+ db_index=True,
+ max_length=32,
+ null=True,
+ verbose_name="Access Type",
+ ),
+ ),
+ ]
diff --git a/source/migrations/0003_alter_source_title.py b/source/migrations/0003_alter_source_title.py
new file mode 100644
index 0000000..354a82a
--- /dev/null
+++ b/source/migrations/0003_alter_source_title.py
@@ -0,0 +1,15 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("source", "0002_source_access_type"),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name="source",
+ name="title",
+ field=models.CharField(max_length=500, verbose_name="Source Title"),
+ ),
+ ]
diff --git a/source/migrations/__init__.py b/source/migrations/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/source/migrations/__init__.py
@@ -0,0 +1 @@
+
diff --git a/source/models.py b/source/models.py
new file mode 100644
index 0000000..48d3e00
--- /dev/null
+++ b/source/models.py
@@ -0,0 +1,219 @@
+from django.db import models
+from django.utils.translation import gettext_lazy as _
+
+from collection.models import Collection
+from core.models import CommonControlField
+
+
+class Source(CommonControlField):
+ SOURCE_TYPE_JOURNAL = "journal"
+ SOURCE_TYPE_BOOK = "book"
+ SOURCE_TYPE_PREPRINT_SERVER = "preprint_server"
+ SOURCE_TYPE_DATA_REPOSITORY = "data_repository"
+ SOURCE_TYPE_OTHER = "other"
+ SOURCE_TYPE_CHOICES = (
+ (SOURCE_TYPE_JOURNAL, _("Journal")),
+ (SOURCE_TYPE_BOOK, _("Book")),
+ (SOURCE_TYPE_PREPRINT_SERVER, _("Preprint Server")),
+ (SOURCE_TYPE_DATA_REPOSITORY, _("Data Repository")),
+ (SOURCE_TYPE_OTHER, _("Other")),
+ )
+
+ ACCESS_TYPE_OPEN_ACCESS = "open_access"
+ ACCESS_TYPE_COMMERCIAL = "commercial"
+ ACCESS_TYPE_CHOICES = (
+ (ACCESS_TYPE_OPEN_ACCESS, _("Open Access")),
+ (ACCESS_TYPE_COMMERCIAL, _("Commercial")),
+ )
+
+ collection = models.ForeignKey(
+ Collection,
+ verbose_name=_("Collection"),
+ on_delete=models.CASCADE,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ source_type = models.CharField(
+ verbose_name=_("Source Type"),
+ max_length=32,
+ choices=SOURCE_TYPE_CHOICES,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ source_id = models.CharField(
+ verbose_name=_("Source ID"),
+ max_length=255,
+ blank=False,
+ null=False,
+ db_index=True,
+ )
+
+ scielo_issn = models.CharField(
+ verbose_name=_("SciELO ISSN"),
+ max_length=9,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ acronym = models.CharField(
+ verbose_name=_("Source Acronym"),
+ max_length=64,
+ blank=True,
+ null=True,
+ default="",
+ )
+
+ title = models.CharField(
+ verbose_name=_("Source Title"),
+ max_length=500,
+ blank=False,
+ null=False,
+ )
+
+ identifiers = models.JSONField(
+ verbose_name=_("Identifiers"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ publisher_name = models.JSONField(
+ verbose_name=_("Publisher Name"),
+ blank=True,
+ null=True,
+ default=list,
+ )
+
+ subject_areas = models.JSONField(
+ verbose_name=_("Subject Areas (CAPES)"),
+ null=False,
+ blank=False,
+ default=list,
+ )
+
+ wos_subject_areas = models.JSONField(
+ verbose_name=_("Subject Areas (WoS)"),
+ null=False,
+ blank=False,
+ default=list,
+ )
+
+ default_lang = models.CharField(
+ verbose_name=_("Default Language"),
+ max_length=8,
+ blank=True,
+ null=True,
+ )
+
+ publication_date = models.CharField(
+ verbose_name=_("Publication Date"),
+ max_length=32,
+ blank=True,
+ null=True,
+ )
+
+ publication_year = models.CharField(
+ verbose_name=_("Publication Year"),
+ max_length=4,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ access_type = models.CharField(
+ verbose_name=_("Access Type"),
+ max_length=32,
+ choices=ACCESS_TYPE_CHOICES,
+ blank=True,
+ null=True,
+ db_index=True,
+ )
+
+ extra_data = models.JSONField(
+ verbose_name=_("Extra Data"),
+ null=True,
+ blank=True,
+ default=dict,
+ )
+
+ def __str__(self):
+ return f"{self.collection.acron3} - {self.source_type} - {self.source_id}"
+
+ @staticmethod
+ def _extract_issns(identifiers):
+ if not isinstance(identifiers, dict):
+ return set()
+
+ return {
+ value
+ for key, value in identifiers.items()
+ if value and "issn" in str(key).lower()
+ }
+
+ @classmethod
+ def metadata(cls, collection=None):
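+ # Stream one lightweight dict per source (optionally restricted to a collection),
+ # loading only the fields listed below via a server-side iterator.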
+ queryset = cls.objects.select_related("collection").only(
+ "acronym",
+ "collection__acron3",
+ "default_lang",
+ "extra_data",
+ "identifiers",
+ "publication_date",
+ "publication_year",
+ "access_type",
+ "publisher_name",
+ "scielo_issn",
+ "source_id",
+ "source_type",
+ "subject_areas",
+ "title",
+ "wos_subject_areas",
+ )
+
+ if collection:
+ queryset = queryset.filter(collection=collection)
+
+ for source in queryset.iterator():
+ identifiers = source.identifiers or {}
+ yield {
+ "acronym": source.acronym,
+ "collection": source.collection.acron3,
+ "default_lang": source.default_lang,
+ "extra_data": source.extra_data or {},
+ "identifiers": identifiers,
+ "issns": cls._extract_issns(identifiers),
+ "publication_date": source.publication_date,
+ "publication_year": source.publication_year,
+ "access_type": source.access_type,
+ "publisher_name": source.publisher_name or [],
+ "scielo_issn": source.scielo_issn,
+ "source_id": source.source_id,
+ "source_type": source.source_type,
+ "subject_areas": source.subject_areas or [],
+ "title": source.title,
+ "wos_subject_areas": source.wos_subject_areas or [],
+ }
+
+ class Meta:
+ verbose_name = _("Source")
+ verbose_name_plural = _("Sources")
+ unique_together = (
+ "collection",
+ "source_type",
+ "source_id",
+ )
+ indexes = [
+ models.Index(
+ fields=["collection", "source_type"],
+ name="source_collection_type_idx",
+ ),
+ models.Index(
+ fields=["collection", "scielo_issn"],
+ name="source_collection_issn_idx",
+ ),
+ ]
diff --git a/source/services/__init__.py b/source/services/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/source/services/__init__.py
@@ -0,0 +1 @@
+
diff --git a/source/services/books.py b/source/services/books.py
new file mode 100644
index 0000000..df9bd4d
--- /dev/null
+++ b/source/services/books.py
@@ -0,0 +1,137 @@
+from collection.models import Collection
+from source.models import Source
+
+
+BOOKS_COLLECTION_ACRONYM = "books"
+
+
+def get_books_collection(acronym=BOOKS_COLLECTION_ACRONYM):
+ return Collection.objects.get(acron3=acronym)
+
+
+def upsert_monograph_source(
+ payload,
+ collection,
+ user=None,
+ force_update=True,
+ source_url=None,
+ last_seq=None,
+):
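+ # Only "Monograph" payloads become book sources; other document types are ignored.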
+ if payload.get("TYPE") != "Monograph":
+ return None
+
+ source, created = Source.objects.get_or_create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ source_id=str(payload.get("id")),
+ )
+
+ if created and user:
+ source.creator = user
+
+ if created or force_update:
+ source.scielo_issn = None
+ source.acronym = ""
+ source.title = payload.get("title") or str(payload.get("id"))
+ source.identifiers = _build_source_identifiers(payload)
+ source.publisher_name = _as_list(payload.get("publisher"))
+ source.subject_areas = []
+ source.wos_subject_areas = []
+ source.default_lang = payload.get("language") or None
+ source.publication_date = payload.get("publication_date") or None
+ source.publication_year = _normalize_year(payload.get("year"))
+ source.access_type = _normalize_access_type(payload.get("is_comercial"))
+ source.extra_data = _build_source_extra_data(
+ payload,
+ source_url=source_url,
+ last_seq=last_seq,
+ )
+
+ if user:
+ source.updated_by = user
+
+ source.save()
+ return source
+
+
+def delete_book_source(collection, book_id):
+ return Source.objects.filter(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_BOOK,
+ source_id=str(book_id),
+ ).delete()
+
+
+def _build_source_identifiers(payload):
+ identifiers = {
+ "book_id": str(payload.get("id")) if payload.get("id") is not None else None,
+ "isbn": payload.get("isbn"),
+ "eisbn": payload.get("eisbn"),
+ "doi": payload.get("doi_number"),
+ }
+ return _compact_dict(identifiers)
+
+
+def _build_source_extra_data(payload, source_url=None, last_seq=None):
+ extra_data = {
+ "raw_type": payload.get("TYPE"),
+ "source_url": source_url,
+ "last_seq": last_seq,
+ "visible": payload.get("visible"),
+ "city": payload.get("city"),
+ "country": payload.get("country"),
+ "pages": payload.get("pages"),
+ "collection_data": payload.get("collection"),
+ "creators": payload.get("creators"),
+ "is_comercial": payload.get("is_comercial"),
+ "use_licence": payload.get("use_licence"),
+ "price_reais": payload.get("price_reais"),
+ "price_dollar": payload.get("price_dollar"),
+ "shopping_info": payload.get("shopping_info"),
+ "serie": payload.get("serie"),
+ "format": payload.get("format"),
+ "translated_titles": payload.get("translated_titles"),
+ "translated_synopses": payload.get("translated_synopses"),
+ "synopsis": payload.get("synopsis"),
+ "primary_descriptor": payload.get("primary_descriptor"),
+ "translated_primary_descriptors": payload.get("translated_primary_descriptors"),
+ }
+ return _compact_dict(extra_data)
+
+
+def _as_list(value):
+ if not value:
+ return []
+
+ if isinstance(value, list):
+ return value
+
+ return [value]
+
+
+def _normalize_year(value):
+ if value in (None, ""):
+ return None
+ return str(value)[:4]
+
+
+def _normalize_access_type(value):
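+ # The payload's `is_comercial` flag may be a boolean or a string; truthy values
+ # map to commercial access, falsy values to open access.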
+ if value in (None, ""):
+ return None
+
+ if isinstance(value, str):
+ normalized = value.strip().lower()
+ if normalized in {"true", "1", "yes", "y", "sim"}:
+ return Source.ACCESS_TYPE_COMMERCIAL
+ if normalized in {"false", "0", "no", "n", "nao", "não"}:
+ return Source.ACCESS_TYPE_OPEN_ACCESS
+
+ return Source.ACCESS_TYPE_COMMERCIAL if bool(value) else Source.ACCESS_TYPE_OPEN_ACCESS
+
+
+def _compact_dict(data):
+ return {
+ key: value
+ for key, value in data.items()
+ if value not in (None, "", [], {}, ())
+ }
diff --git a/source/services/journals.py b/source/services/journals.py
new file mode 100644
index 0000000..ac133f6
--- /dev/null
+++ b/source/services/journals.py
@@ -0,0 +1,118 @@
+from django.db.models import Q
+
+from collection.models import Collection
+from source.models import Source
+
+
+def get_collection(acronym):
+ return Collection.objects.filter(acron3=acronym).first()
+
+
+def upsert_journal_source(
+ journal,
+ collection,
+ user=None,
+ force_update=True,
+ load_mode=None,
+):
+ scielo_issn = _value(journal, "scielo_issn")
+ if not scielo_issn:
+ return None
+
+ source, created = Source.objects.get_or_create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ source_id=scielo_issn,
+ )
+
+ if created and user:
+ source.creator = user
+
+ if created or force_update:
+ source.scielo_issn = scielo_issn
+ source.acronym = _value(journal, "acronym") or ""
+ source.title = _value(journal, "title") or scielo_issn
+ source.identifiers = _build_source_identifiers(journal)
+ source.publisher_name = _as_list(_value(journal, "publisher_name"))
+ source.subject_areas = _as_list(_value(journal, "subject_areas"))
+ source.wos_subject_areas = _as_list(_value(journal, "wos_subject_areas"))
+ source.default_lang = None
+ source.publication_date = None
+ source.publication_year = None
+ source.extra_data = _compact_dict(
+ {
+ "collection_acronym": _value(journal, "collection_acronym"),
+ "load_mode": load_mode,
+ }
+ )
+
+ if user:
+ source.updated_by = user
+
+ source.save()
+ return source
+
+
+def find_journal_source_by_issns(collection, issns):
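+ # Try each ISSN in turn against the SciELO ISSN, the source_id, and the ISSN
+ # keys stored in the identifiers JSON field.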
+ for issn in filter(None, issns or []):
+ source = (
+ Source.objects.filter(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ )
+ .filter(
+ Q(scielo_issn=issn)
+ | Q(source_id=issn)
+ | Q(identifiers__electronic_issn=issn)
+ | Q(identifiers__print_issn=issn)
+ | Q(identifiers__scielo_issn=issn)
+ )
+ .first()
+ )
+ if source:
+ return source
+ return None
+
+
+def find_journal_source_by_acronym(collection, acronym):
+ if not acronym:
+ return None
+
+ return Source.objects.filter(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ acronym=acronym,
+ ).first()
+
+
+def _build_source_identifiers(journal):
+ identifiers = {
+ "electronic_issn": _value(journal, "electronic_issn"),
+ "print_issn": _value(journal, "print_issn"),
+ "scielo_issn": _value(journal, "scielo_issn"),
+ }
+ return _compact_dict(identifiers)
+
+
+def _as_list(value):
+ if not value:
+ return []
+
+ if isinstance(value, list):
+ return value
+
+ return [value]
+
+
+def _value(data, key, default=None):
+ if isinstance(data, dict):
+ return data.get(key, default)
+ return getattr(data, key, default)
+
+
+def _compact_dict(data):
+ return {
+ key: value
+ for key, value in data.items()
+ if value not in (None, "", [], {}, ())
+ }
diff --git a/source/tasks.py b/source/tasks.py
new file mode 100644
index 0000000..eb1633b
--- /dev/null
+++ b/source/tasks.py
@@ -0,0 +1,148 @@
+import logging
+
+from django.utils.translation import gettext as _
+from django.conf import settings
+
+from collection.models import Collection
+from config import celery_app
+from core.collectors import articlemeta as articlemeta_collector
+from core.collectors import scielo_books as scielo_books_collector
+from core.utils.request_utils import _get_user
+from source.services import books as books_service
+from source.services import journals as journal_service
+
+
+def load_sources_from_article_meta(
+ collections=None,
+ force_update=True,
+ user=None,
+ mode="thrift",
+):
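+ # Iterate ArticleMeta journal records per collection and upsert a journal Source
+ # for each one.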
+ collection_codes = collections or Collection.acron3_list()
+
+ for collection_code in collection_codes:
+ logging.info(
+ "Loading sources from Article Meta. Collection: %s, Mode: %s",
+ collection_code,
+ mode,
+ )
+
+ for journal in articlemeta_collector.iter_journals(
+ collection=collection_code,
+ mode=mode,
+ ):
+ collection = journal_service.get_collection(journal.collection_acronym)
+ if not collection:
+ logging.error(
+ "Collection %s does not exist",
+ journal.collection_acronym,
+ )
+ continue
+
+ source = journal_service.upsert_journal_source(
+ journal,
+ collection=collection,
+ user=user,
+ force_update=force_update,
+ load_mode=mode,
+ )
+ logging.info(
+ "Source %s upserted for collection %s",
+ source.source_id if source else None,
+ collection.acron3,
+ )
+
+ return True
+
+
+def load_sources_from_scielo_books(
+ collection="books",
+ db_name=settings.SCIELO_BOOKS_DB_NAME,
+ since=0,
+ limit=settings.SCIELO_BOOKS_LIMIT,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user=None,
+):
+ collection_obj = books_service.get_books_collection(collection)
+
+ logging.info(
+ "Loading sources from SciELO Books. Collection: %s, DB: %s, Since: %s, Limit: %s",
+ collection,
+ db_name,
+ since,
+ limit,
+ )
+
+ for item in scielo_books_collector.iter_change_documents(
+ base_url=base_url,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ headers=headers,
+ ):
+ change = item["change"]
+
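+ # Deleted entries in the change feed remove the corresponding book Source.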
+ if item["deleted"]:
+ books_service.delete_book_source(collection_obj, change.get("id"))
+ continue
+
+ payload = item["payload"] or {}
+ if payload.get("TYPE") != "Monograph":
+ continue
+
+ books_service.upsert_monograph_source(
+ payload,
+ collection=collection_obj,
+ user=user,
+ force_update=force_update,
+ source_url=item.get("source_url"),
+ last_seq=change.get("seq"),
+ )
+
+ return True
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (Article Meta)"), queue="load")
+def task_load_sources_from_article_meta(
+ self,
+ collections=None,
+ force_update=True,
+ user_id=None,
+ username=None,
+ mode="thrift",
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_sources_from_article_meta(
+ collections=collections,
+ force_update=force_update,
+ user=user,
+ mode=mode,
+ )
+
+
+@celery_app.task(bind=True, name=_("[Metadata] Sync Sources (SciELO Books)"), queue="load")
+def task_load_sources_from_scielo_books(
+ self,
+ collection="books",
+ db_name=settings.SCIELO_BOOKS_DB_NAME,
+ since=0,
+ limit=settings.SCIELO_BOOKS_LIMIT,
+ force_update=True,
+ headers=None,
+ base_url=None,
+ user_id=None,
+ username=None,
+):
+ user = _get_user(self.request, username=username, user_id=user_id)
+ return load_sources_from_scielo_books(
+ collection=collection,
+ db_name=db_name,
+ since=since,
+ limit=limit,
+ force_update=force_update,
+ headers=headers,
+ base_url=base_url,
+ user=user,
+ )
diff --git a/source/tests.py b/source/tests.py
new file mode 100644
index 0000000..a182f4e
--- /dev/null
+++ b/source/tests.py
@@ -0,0 +1,133 @@
+from django.test import TestCase
+
+from collection.models import Collection
+
+from .models import Source
+from .services import books as books_service
+from .services import journals as journal_service
+
+
+class SourceMetadataTests(TestCase):
+ def test_source_type_choices_include_scielo_non_journal_sources(self):
+ self.assertIn(
+ (Source.SOURCE_TYPE_PREPRINT_SERVER, "Preprint Server"),
+ [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES],
+ )
+ self.assertIn(
+ (Source.SOURCE_TYPE_DATA_REPOSITORY, "Data Repository"),
+ [(value, str(label)) for value, label in Source.SOURCE_TYPE_CHOICES],
+ )
+
+ def test_metadata_exposes_generic_and_journal_fields(self):
+ collection = Collection.objects.create(acron3="scl", acron2="sc")
+ Source.objects.create(
+ collection=collection,
+ source_type=Source.SOURCE_TYPE_JOURNAL,
+ source_id="1234-5678",
+ scielo_issn="1234-5678",
+ acronym="testjou",
+ title="Test Journal",
+ identifiers={
+ "electronic_issn": "1234-5678",
+ "print_issn": "8765-4321",
+ "doi": "10.1590/example",
+ },
+ publisher_name=["SciELO"],
+ subject_areas=["Health Sciences"],
+ wos_subject_areas=["Medicine"],
+ default_lang="en",
+ publication_date="2024-01-15",
+ publication_year="2024",
+ extra_data={"country": "BR"},
+ )
+
+ metadata = list(Source.metadata(collection=collection))
+
+ self.assertEqual(len(metadata), 1)
+ self.assertEqual(metadata[0]["source_type"], Source.SOURCE_TYPE_JOURNAL)
+ self.assertEqual(metadata[0]["source_id"], "1234-5678")
+ self.assertEqual(metadata[0]["scielo_issn"], "1234-5678")
+ self.assertEqual(metadata[0]["issns"], {"1234-5678", "8765-4321"})
+ self.assertEqual(metadata[0]["title"], "Test Journal")
+
+ def test_upsert_monograph_source_maps_scielo_books_payload(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+
+ source = books_service.upsert_monograph_source(
+ {
+ "TYPE": "Monograph",
+ "id": "abcd1",
+ "title": "Sample Book",
+ "isbn": "9788578791889",
+ "eisbn": "9788578791880",
+ "doi_number": "10.1234/book",
+ "language": "pt",
+ "publication_date": "2024-05-20",
+ "year": "2024",
+ "publisher": "SciELO Books",
+ "is_comercial": False,
+ "visible": True,
+ },
+ collection=collection,
+ )
+
+ self.assertEqual(source.source_type, Source.SOURCE_TYPE_BOOK)
+ self.assertEqual(source.source_id, "abcd1")
+ self.assertEqual(source.identifiers["isbn"], "9788578791889")
+ self.assertEqual(source.default_lang, "pt")
+ self.assertEqual(source.publication_year, "2024")
+ self.assertEqual(source.access_type, Source.ACCESS_TYPE_OPEN_ACCESS)
+
+ def test_upsert_monograph_source_accepts_long_real_world_title(self):
+ collection = Collection.objects.create(acron3="books", acron2="bk")
+ title = (
+ "O Estado da Arte sobre Refugiados, Deslocados Internos, "
+ "Deslocados Ambientais e Apatridas no Brasil: atualizacao do "
+ "Diretorio Nacional do ACNUR de teses, dissertacoes, trabalhos "
+ "de conclusao de curso de graduacao em Joao Pessoa (Paraiba) e "
+ "artigos (2007 a 2017)"
+ )
+
+ source = books_service.upsert_monograph_source(
+ {
+ "TYPE": "Monograph",
+ "id": "9zzts",
+ "title": title,
+ },
+ collection=collection,
+ )
+
+ self.assertEqual(source.title, title)
+
+ def test_upsert_journal_source_maps_articlemeta_payload(self):
+ collection = Collection.objects.create(acron3="scl", acron2="sc")
+
+ source = journal_service.upsert_journal_source(
+ {
+ "collection_acronym": "scl",
+ "scielo_issn": "1234-5678",
+ "electronic_issn": "1234-5678",
+ "print_issn": "8765-4321",
+ "acronym": "testjou",
+ "title": "Test Journal",
+ "publisher_name": "SciELO",
+ "subject_areas": ["Health Sciences"],
+ "wos_subject_areas": ["Medicine"],
+ },
+ collection=collection,
+ load_mode="thrift",
+ )
+
+ self.assertEqual(source.source_type, Source.SOURCE_TYPE_JOURNAL)
+ self.assertEqual(source.source_id, "1234-5678")
+ self.assertEqual(source.identifiers["electronic_issn"], "1234-5678")
+ self.assertEqual(source.publisher_name, ["SciELO"])
+ self.assertEqual(source.extra_data["load_mode"], "thrift")
+ self.assertEqual(
+ journal_service.find_journal_source_by_issns(collection, ["8765-4321"]).pk,
+ source.pk,
+ )
+ self.assertEqual(
+ journal_service.find_journal_source_by_acronym(collection, "testjou").pk,
+ source.pk,
+ )
diff --git a/source/wagtail_hooks.py b/source/wagtail_hooks.py
new file mode 100644
index 0000000..5ffad62
--- /dev/null
+++ b/source/wagtail_hooks.py
@@ -0,0 +1,32 @@
+from django.utils.translation import gettext_lazy as _
+from wagtail.snippets.views.snippets import SnippetViewSet
+
+from .models import Source
+
+
+class SourceSnippetViewSet(SnippetViewSet):
+ model = Source
+ icon = "folder-open-inverse"
+ menu_label = _("Source")
+ menu_order = 200
+
+ list_display = (
+ "collection",
+ "source_type",
+ "source_id",
+ "scielo_issn",
+ "acronym",
+ "title",
+ "publication_year",
+ )
+ list_filter = (
+ "collection",
+ "source_type",
+ "publication_year",
+ )
+ search_fields = (
+ "source_id",
+ "scielo_issn",
+ "acronym",
+ "title",
+ )
diff --git a/start-dev.sh b/start-dev.sh
deleted file mode 100644
index 92d064a..0000000
--- a/start-dev.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-# Change this value to the local ethernet.
-ethernet=wlp0s20f3
-
-# Linux IP.
-export IP=$(/sbin/ip -o -4 addr list $ethernet | awk '{print $4}' | cut -d/ -f1)
-
-# Mac OS IP.
-#export IP=$(ifconfig $ethernet | grep inet | grep -v inet6 | awk '{print $2}')
-
-export DATABASE_URL=postgres://GVRFlLmcCNfGLhsFvSnCioYOPJPYpyfj:BQ4hSUL4rdj5WZLdR8ilDLRQMvCtzo0caMaXDO0olGsmycQjlcZlTVK9DepZR8kk@$IP:5432/scielo_core
-export CELERY_BROKER_URL=redis://$IP:6379/0
-export USE_DOCKER=no
-export IPYTHONDIR=/app/.ipython
-export REDIS_URL=redis://$IP:6379/0
-export CELERY_FLOWER_USER=PhFRdLexbrsBvrrbSXxjcMMOcVOavCrZ
-export CELERY_FLOWER_PASSWORD=QgScyefPrYhHgO6onW61u0nazc5xdBuP4sM7jMRrBBFuA2RjsFhZLp7xbVYZbrwR
-export EMAIL_HOST=$IP
-export SOLR_URL=http://$IP:8983/solr/
-
-
-docker stop scielo_core_local_django
-# workon scms
-python manage.py runserver_plus 0.0.0.0:8000
diff --git a/tracker/choices.py b/tracker/choices.py
index e2c80e2..dfc562c 100644
--- a/tracker/choices.py
+++ b/tracker/choices.py
@@ -1,54 +1,16 @@
from django.utils.translation import gettext_lazy as _
-ERROR = "ERROR"
-EXCEPTION = "EXCEPTION"
-INFO = "INFO"
-WARNING = "WARNING"
-
-EVENT_MSG_TYPE = [
- (ERROR, _("error")),
- (WARNING, _("warning")),
- (INFO, _("info")),
- (EXCEPTION, _("exception")),
-]
-
-
-PROGRESS_STATUS_IGNORED = "IGNORED"
-PROGRESS_STATUS_REPROC = "REPROC"
-PROGRESS_STATUS_TODO = "TODO"
-PROGRESS_STATUS_DOING = "DOING"
-PROGRESS_STATUS_DONE = "DONE"
-PROGRESS_STATUS_PENDING = "PENDING"
-
-PROGRESS_STATUS = (
- (PROGRESS_STATUS_REPROC, _("To reprocess")),
- (PROGRESS_STATUS_TODO, _("To do")),
- (PROGRESS_STATUS_DONE, _("Done")),
- (PROGRESS_STATUS_DOING, _("Doing")),
- (PROGRESS_STATUS_PENDING, _("Pending")),
- (PROGRESS_STATUS_IGNORED, _("ignored")),
-)
-
LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA = 'MET'
-LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE = 'ART'
-LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL = 'JOU'
+LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT = 'DOC'
+LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE = 'SRC'
LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION = 'URL'
LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR = 'DBE'
LOG_FILE_DISCARDED_LINE_REASON = [
(LOG_FILE_DISCARDED_LINE_REASON_MISSING_METADATA, _("Missing Metadata")),
- (LOG_FILE_DISCARDED_LINE_REASON_MISSING_ARTICLE, _("Missing PIDv2 or PIDv3 or PID Generic")),
- (LOG_FILE_DISCARDED_LINE_REASON_MISSING_JOURNAL, _("Missing ISSN")),
+ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_DOCUMENT, _("Missing Document")),
+ (LOG_FILE_DISCARDED_LINE_REASON_MISSING_SOURCE, _("Missing Source")),
(LOG_FILE_DISCARDED_LINE_REASON_URL_TRANSLATION, _("URL Translation")),
(LOG_FILE_DISCARDED_LINE_REASON_DATABASE_ERROR, _("Database Error")),
]
-
-
-ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED = 'MUL'
-ARTICLE_EVENT_TYPE_DATA_ERROR = 'ERR'
-
-ARTICLE_EVENT_TYPE = [
- (ARTICLE_EVENT_TYPE_MULTIPLE_OBJS_RETURNED, _("Multiple Articles Returned")),
- (ARTICLE_EVENT_TYPE_DATA_ERROR, _("Data Error")),
-]
diff --git a/tracker/exceptions.py b/tracker/exceptions.py
index 31ed8c8..9ef3267 100644
--- a/tracker/exceptions.py
+++ b/tracker/exceptions.py
@@ -1,26 +1,2 @@
-class ProcEventCreateError(Exception):
- ...
-
-class UnexpectedEventCreateError(Exception):
- ...
-
-class EventCreateError(Exception):
- ...
-
-class EventReportCreateError(Exception):
- ...
-
-class EventReportSaveFileError(Exception):
- ...
-
-class EventReportCreateError(Exception):
- ...
-
-class EventReportDeleteEventsError(Exception):
- ...
-
class LogFileDiscardedLineCreateError(Exception):
...
-
-class ArticleEventError(Exception):
- ...
diff --git a/tracker/migrations/0001_initial.py b/tracker/migrations/0001_initial.py
index f207722..04fdc35 100644
--- a/tracker/migrations/0001_initial.py
+++ b/tracker/migrations/0001_initial.py
@@ -1,13 +1,18 @@
-# Generated by Django 5.0.7 on 2024-08-30 00:52
+# Generated by Codex on 2026-04-27
+import django.db.models.deletion
import uuid
+from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
- dependencies = []
+ dependencies = [
+ ("log_manager", "0001_initial"),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
operations = [
migrations.CreateModel(
@@ -24,21 +29,15 @@ class Migration(migrations.Migration):
),
(
"created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
),
(
"exception_type",
- models.TextField(
- blank=True, null=True, verbose_name="Exception Type"
- ),
+ models.TextField(blank=True, null=True, verbose_name="Exception Type"),
),
(
"exception_msg",
- models.TextField(
- blank=True, null=True, verbose_name="Exception Msg"
- ),
+ models.TextField(blank=True, null=True, verbose_name="Exception Msg"),
),
("traceback", models.JSONField(blank=True, null=True)),
("detail", models.JSONField(blank=True, null=True)),
@@ -46,9 +45,148 @@ class Migration(migrations.Migration):
options={
"indexes": [
models.Index(
- fields=["exception_type"], name="tracker_une_excepti_47ede4_idx"
+ fields=["exception_type"],
+ name="tracker_une_excepti_47ede4_idx",
)
],
},
),
+ migrations.CreateModel(
+ name="ArticleEvent",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
+ ),
+ (
+ "updated",
+ models.DateTimeField(auto_now=True, verbose_name="Last update date"),
+ ),
+ (
+ "event_type",
+ models.CharField(
+ blank=True,
+ choices=[
+ ("MUL", "Multiple Articles Returned"),
+ ("ERR", "Data Error"),
+ ],
+ max_length=3,
+ null=True,
+ verbose_name="Event Type",
+ ),
+ ),
+ (
+ "message",
+ models.TextField(blank=True, null=True, verbose_name="Message"),
+ ),
+ ("data", models.JSONField(default=dict, verbose_name="Data")),
+ ("handled", models.BooleanField(default=False, verbose_name="Handled")),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ ),
+ migrations.CreateModel(
+ name="LogFileDiscardedLine",
+ fields=[
+ (
+ "id",
+ models.BigAutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "created",
+ models.DateTimeField(auto_now_add=True, verbose_name="Creation date"),
+ ),
+ (
+ "updated",
+ models.DateTimeField(auto_now=True, verbose_name="Last update date"),
+ ),
+ (
+ "error_type",
+ models.CharField(
+ blank=True,
+ choices=[
+ ("MET", "Missing Metadata"),
+ ("DOC", "Missing Document"),
+ ("SRC", "Missing Source"),
+ ("URL", "URL Translation"),
+ ("DBE", "Database Error"),
+ ],
+ max_length=3,
+ null=True,
+ verbose_name="Error Type",
+ ),
+ ),
+ ("data", models.JSONField(default=dict, verbose_name="Data")),
+ (
+ "message",
+ models.TextField(blank=True, null=True, verbose_name="Message"),
+ ),
+ ("handled", models.BooleanField(default=False, verbose_name="Handled")),
+ (
+ "creator",
+ models.ForeignKey(
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_creator",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Creator",
+ ),
+ ),
+ (
+ "log_file",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="log_manager.logfile",
+ ),
+ ),
+ (
+ "updated_by",
+ models.ForeignKey(
+ blank=True,
+ editable=False,
+ null=True,
+ on_delete=django.db.models.deletion.SET_NULL,
+ related_name="%(class)s_last_mod_user",
+ to=settings.AUTH_USER_MODEL,
+ verbose_name="Updater",
+ ),
+ ),
+ ],
+ ),
]
diff --git a/tracker/migrations/0002_remove_articleevent_creator_and_more.py b/tracker/migrations/0002_remove_articleevent_creator_and_more.py
new file mode 100644
index 0000000..ee23c85
--- /dev/null
+++ b/tracker/migrations/0002_remove_articleevent_creator_and_more.py
@@ -0,0 +1,38 @@
+# Generated by Django 5.2.12 on 2026-05-01 22:23
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("tracker", "0001_initial"),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name="articleevent",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="articleevent",
+ name="updated_by",
+ ),
+ migrations.DeleteModel(
+ name="UnexpectedEvent",
+ ),
+ migrations.RemoveField(
+ model_name="logfilediscardedline",
+ name="creator",
+ ),
+ migrations.RemoveField(
+ model_name="logfilediscardedline",
+ name="updated",
+ ),
+ migrations.RemoveField(
+ model_name="logfilediscardedline",
+ name="updated_by",
+ ),
+ migrations.DeleteModel(
+ name="ArticleEvent",
+ ),
+ ]
diff --git a/tracker/migrations/0002_top100articlesfileevent.py b/tracker/migrations/0002_top100articlesfileevent.py
deleted file mode 100644
index 230fb8a..0000000
--- a/tracker/migrations/0002_top100articlesfileevent.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# Generated by Django 5.0.7 on 2024-08-30 21:52
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("metrics", "0002_alter_top100articlesfile_status"),
- ("tracker", "0001_initial"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="Top100ArticlesFileEvent",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "status",
- models.CharField(
- blank=True, max_length=64, null=True, verbose_name="Status"
- ),
- ),
- (
- "lines",
- models.IntegerField(
- blank=True, default=0, null=True, verbose_name="Lines"
- ),
- ),
- (
- "message",
- models.TextField(blank=True, null=True, verbose_name="Message"),
- ),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "file",
- models.ForeignKey(
- blank=True,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- to="metrics.top100articlesfile",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "verbose_name_plural": "Top 100 Article File Events",
- },
- ),
- ]
diff --git a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py b/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py
deleted file mode 100644
index 6e37a9f..0000000
--- a/tracker/migrations/0003_logfilediscardedline_delete_top100articlesfileevent.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-07 16:55
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("log_manager", "0002_alter_collectionconfig_unique_together_and_more"),
- ("tracker", "0002_top100articlesfileevent"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="LogFileDiscardedLine",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "error_type",
- models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing Article"),
- ("JOU", "Missing Journal"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ("data", models.JSONField(default=dict, verbose_name="Data")),
- (
- "message",
- models.TextField(blank=True, null=True, verbose_name="Message"),
- ),
- ("handled", models.BooleanField(default=False, verbose_name="Handled")),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "log_file",
- models.ForeignKey(
- on_delete=django.db.models.deletion.CASCADE,
- to="log_manager.logfile",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "abstract": False,
- },
- ),
- migrations.DeleteModel(
- name="Top100ArticlesFileEvent",
- ),
- ]
diff --git a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py b/tracker/migrations/0004_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index 1061793..0000000
--- a/tracker/migrations/0004_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Generated by Django 5.0.7 on 2025-03-27 20:40
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0003_logfilediscardedline_delete_top100articlesfileevent"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="logfilediscardedline",
- name="error_type",
- field=models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing Article"),
- ("JOU", "Missing Journal"),
- ("URL", "URL Translation"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ]
diff --git a/tracker/migrations/0005_articleevent.py b/tracker/migrations/0005_articleevent.py
deleted file mode 100644
index 859910e..0000000
--- a/tracker/migrations/0005_articleevent.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Generated by Django 5.0.7 on 2025-05-23 17:27
-
-import django.db.models.deletion
-from django.conf import settings
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0004_alter_logfilediscardedline_error_type"),
- migrations.swappable_dependency(settings.AUTH_USER_MODEL),
- ]
-
- operations = [
- migrations.CreateModel(
- name="ArticleEvent",
- fields=[
- (
- "id",
- models.BigAutoField(
- auto_created=True,
- primary_key=True,
- serialize=False,
- verbose_name="ID",
- ),
- ),
- (
- "created",
- models.DateTimeField(
- auto_now_add=True, verbose_name="Creation date"
- ),
- ),
- (
- "updated",
- models.DateTimeField(
- auto_now=True, verbose_name="Last update date"
- ),
- ),
- (
- "event_type",
- models.CharField(
- blank=True,
- choices=[
- ("MUL", "Multiple Articles Returned"),
- ("ERR", "Data Error"),
- ],
- max_length=3,
- null=True,
- verbose_name="Event Type",
- ),
- ),
- (
- "message",
- models.TextField(blank=True, null=True, verbose_name="Message"),
- ),
- ("data", models.JSONField(default=dict, verbose_name="Data")),
- ("handled", models.BooleanField(default=False, verbose_name="Handled")),
- (
- "creator",
- models.ForeignKey(
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_creator",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Creator",
- ),
- ),
- (
- "updated_by",
- models.ForeignKey(
- blank=True,
- editable=False,
- null=True,
- on_delete=django.db.models.deletion.SET_NULL,
- related_name="%(class)s_last_mod_user",
- to=settings.AUTH_USER_MODEL,
- verbose_name="Updater",
- ),
- ),
- ],
- options={
- "abstract": False,
- },
- ),
- ]
diff --git a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py b/tracker/migrations/0006_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index fb7f74a..0000000
--- a/tracker/migrations/0006_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Generated by Django 5.0.7 on 2025-06-14 10:46
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0005_articleevent"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="logfilediscardedline",
- name="error_type",
- field=models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing Article"),
- ("JOU", "Missing Journal"),
- ("URL", "URL Translation"),
- ("DBE", "Database Error"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ]
diff --git a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py b/tracker/migrations/0007_alter_logfilediscardedline_error_type.py
deleted file mode 100644
index f9ffebe..0000000
--- a/tracker/migrations/0007_alter_logfilediscardedline_error_type.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Generated by Django 5.0.7 on 2025-08-09 21:04
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
- dependencies = [
- ("tracker", "0006_alter_logfilediscardedline_error_type"),
- ]
-
- operations = [
- migrations.AlterField(
- model_name="logfilediscardedline",
- name="error_type",
- field=models.CharField(
- blank=True,
- choices=[
- ("MET", "Missing Metadata"),
- ("ART", "Missing PIDv2 or PIDv3 or PID Generic"),
- ("JOU", "Missing ISSN"),
- ("URL", "URL Translation"),
- ("DBE", "Database Error"),
- ],
- max_length=3,
- null=True,
- verbose_name="Error Type",
- ),
- ),
- ]
diff --git a/tracker/models.py b/tracker/models.py
index 77086ee..a394ed6 100644
--- a/tracker/models.py
+++ b/tracker/models.py
@@ -1,65 +1,13 @@
-import json
-import logging
-import traceback
-import uuid
-
-from datetime import datetime
-
-from django.core.files.base import ContentFile
from django.db import models
from django.utils.translation import gettext_lazy as _
-from core.models import CommonControlField
from log_manager.models import LogFile
from tracker import choices
-
-from .exceptions import *
+from .exceptions import LogFileDiscardedLineCreateError
-class ArticleEvent(CommonControlField):
- event_type = models.CharField(
- _("Event Type"),
- choices=choices.ARTICLE_EVENT_TYPE,
- max_length=3,
- null=True,
- blank=True,
- )
-
- message = models.TextField(
- _("Message"),
- null=True,
- blank=True,
- )
-
- data = models.JSONField(
- _("Data"),
- default=dict,
- )
-
- handled = models.BooleanField(
- _("Handled"),
- default=False
- )
-
- @classmethod
- def create(cls, event_type, message, data):
- try:
- obj = cls()
- obj.event_type = event_type
- obj.message = message
- obj.data = data
- obj.save()
- except Exception as exc:
- raise ArticleEventError(
- f"Unable to create ArticleEvent ({data} - {event_type} - {message}). EXCEPTION {exc}"
- )
- return obj
-
- def __str__(self):
- return f"{self.event_type} - {self.message}"
-
-
-class LogFileDiscardedLine(CommonControlField):
+class LogFileDiscardedLine(models.Model):
+ created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
log_file = models.ForeignKey(
LogFile,
on_delete=models.CASCADE,
@@ -108,174 +56,4 @@ def __str__(self):
return f"{self.data} - {self.message}"
-class UnexpectedEvent(models.Model):
- id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
- created = models.DateTimeField(verbose_name=_("Creation date"), auto_now_add=True)
- exception_type = models.TextField(_("Exception Type"), null=True, blank=True)
- exception_msg = models.TextField(_("Exception Msg"), null=True, blank=True)
- traceback = models.JSONField(null=True, blank=True)
- detail = models.JSONField(null=True, blank=True)
-
- class Meta:
- indexes = [
- models.Index(fields=["exception_type"]),
- ]
-
- def __str__(self):
- return f"{self.exception_msg}"
-
- @property
- def data(self):
- return dict(
- created=self.created.isoformat(),
- exception_type=self.exception_type,
- exception_msg=self.exception_msg,
- traceback=json.dumps(self.traceback),
- detail=json.dumps(self.detail),
- )
-
- @classmethod
- def create(
- cls,
- exception=None,
- exc_traceback=None,
- detail=None,
- ):
- try:
- if exception:
- logging.exception(exception)
-
- obj = cls()
- obj.exception_msg = str(exception)
- obj.exception_type = str(type(exception))
- try:
- json.dumps(detail)
- obj.detail = detail
- except Exception as e:
- obj.detail = str(detail)
- if exc_traceback:
- obj.traceback = traceback.format_tb(exc_traceback)
- obj.save()
- return obj
- except Exception as exc:
- raise UnexpectedEventCreateError(
- f"Unable to create unexpected event ({exception} {exc_traceback}). EXCEPTION {exc}"
- )
-
-
-class Event(CommonControlField):
- id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
- message = models.TextField(_("Message"), null=True, blank=True)
- message_type = models.CharField(
- _("Message type"),
- choices=choices.EVENT_MSG_TYPE,
- max_length=16,
- null=True,
- blank=True,
- )
- detail = models.JSONField(null=True, blank=True)
- unexpected_event = models.ForeignKey(
- 'UnexpectedEvent', on_delete=models.SET_NULL, null=True, blank=True
- )
-
- class Meta:
- abstract = True
- indexes = [
- models.Index(fields=["message_type"]),
- ]
-
- @property
- def data(self):
- d = {}
- d["created"] = self.created.isoformat()
- d["user"] = self.user.username
- d.update(
- dict(
- message=self.message, message_type=self.message_type, detail=self.detail
- )
- )
- if self.unexpected_event:
- d.update(self.unexpected_event.data)
- return d
-
- @classmethod
- def create(
- cls,
- user=None,
- message_type=None,
- message=None,
- e=None,
- exc_traceback=None,
- detail=None,
- ):
- try:
- obj = cls()
- obj.creator = user
- obj.message = message
- obj.message_type = message_type
- obj.detail = detail
- obj.save()
-
- if e:
- logging.exception(f"{message}: {e}")
- obj.unexpected_event = UnexpectedEvent.create(
- exception=e,
- exc_traceback=exc_traceback,
- )
- obj.save()
- except Exception as exc:
- raise EventCreateError(
- f"Unable to create Event ({message} {e}). EXCEPTION: {exc}"
- )
- return obj
-
-
-def tracker_file_directory_path(instance, filename):
- d = datetime.now(datetime.timezone.utc)
- return f"tracker/{d.year}/{d.month}/{d.day}/{filename}"
-
-
-class EventReport(CommonControlField):
- file = models.FileField(
- upload_to=tracker_file_directory_path, null=True, blank=True
- )
-
- class Meta:
- abstract = True
-
- def save_file(self, events, ext=None):
- if not events:
- return
- try:
- ext = ".json"
- content = json.dumps(list([item.data for item in events]))
- name = datetime.now(datetime.timezone.utc).isoformat() + ext
- self.file.save(name, ContentFile(content))
- self.delete_events(events)
- except Exception as e:
- raise EventReportSaveFileError(
- f"Unable to save EventReport.file ({name}). Exception: {e}"
- )
-
- def delete_events(self, events):
- for item in events:
- try:
- item.unexpected_event.delete()
- except:
- pass
- try:
- item.delete()
- except:
- pass
-
- @classmethod
- def create(cls, user):
- try:
- obj = cls()
- obj.creator = user
- obj.save()
- except Exception as e:
- raise EventReportCreateError(
- f"Unable to create EventReport. Exception: {e}"
- )
diff --git a/tracker/tasks.py b/tracker/tasks.py
deleted file mode 100644
index ace8145..0000000
--- a/tracker/tasks.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# tasks.py
-from datetime import datetime
-
-from django.contrib.auth import get_user_model
-
-from config import celery_app
-from core.utils.utils import _get_user
-
-from .models import UnexpectedEvent
-
-
-User = get_user_model()
-
-
-@celery_app.task(bind=True, name="Cleanup unexpected events")
-def delete_unexpected_events(self, exception_type, start_date=None, end_date=None, user_id=None, username=None):
- """
- Delete UnexpectedEvent records based on exception type and optional date range.
- """
- user = _get_user(self.request, username=username, user_id=user_id)
-
- if exception_type == '__all__':
- UnexpectedEvent.objects.all().delete()
- return
-
- filters = {'exception_type__icontains': exception_type}
- if start_date:
- start_date = datetime.fromisoformat(start_date)
- filters['created__gte'] = start_date
- if end_date:
- end_date = datetime.fromisoformat(end_date)
- filters['created__lte'] = end_date
-
- UnexpectedEvent.objects.filter(**filters).delete()
diff --git a/tracker/wagtail_hooks.py b/tracker/wagtail_hooks.py
index ce1b30f..1ceb9c7 100644
--- a/tracker/wagtail_hooks.py
+++ b/tracker/wagtail_hooks.py
@@ -4,35 +4,9 @@
from config.menu import get_menu_order
-from .models import UnexpectedEvent, LogFileDiscardedLine, ArticleEvent
+from .models import LogFileDiscardedLine
-class UnexpectedEventSnippetViewSet(SnippetViewSet):
- model = UnexpectedEvent
- menu_label = _("Unexpected Events")
- icon = 'warning'
- menu_order = get_menu_order("tracker")
- add_to_admin_menu = False
-
- list_display = (
- "exception_type",
- "exception_msg",
- "traceback",
- "created",
- )
- list_filter = ("exception_type",)
- search_fields = (
- "exception_msg",
- "detail",
- )
- inspect_view_fields = (
- "exception_type",
- "exception_msg",
- "traceback",
- "detail",
- "created",
- )
-
class LogFileDiscardedLineSnippetViewSet(SnippetViewSet):
model = LogFileDiscardedLine
menu_label = _("Discarded Lines")
@@ -64,34 +38,7 @@ class LogFileDiscardedLineSnippetViewSet(SnippetViewSet):
"handled",
)
-class ArticleEventSnippetViewSet(SnippetViewSet):
- model = ArticleEvent
- menu_label = _("Article Events")
- icon = 'warning'
- menu_order = get_menu_order("tracker")
- add_to_admin_menu = False
-
- list_display = (
- "event_type",
- "message",
- "data",
- "handled",
- )
-
- list_filter = (
- "event_type",
- "handled",
- )
- search_fields = (
- "message",
- )
- inspect_view_fields = (
- "event_type",
- "message",
- "data",
- "handled",
- )
class TrackerSnippetViewSetGroup(SnippetViewSetGroup):
@@ -101,9 +48,7 @@ class TrackerSnippetViewSetGroup(SnippetViewSetGroup):
menu_order = get_menu_order("tracker")
items = (
- UnexpectedEventSnippetViewSet,
LogFileDiscardedLineSnippetViewSet,
- ArticleEventSnippetViewSet,
)